Plugin Directory

Changeset 3484551


Ignore:
Timestamp:
03/17/2026 08:19:04 AM (2 weeks ago)
Author:
intufind
Message:

Release intufind v1.4.0

Location:
intufind
Files:
36 added
3 edited

Legend:

Unmodified
Added
Removed
  • intufind/trunk/includes/class-intufind-content-extractor.php

    r3463908 r3484551  
    6565
    6666        // Primary content.
    67         $primary_content = $this->clean_content( $post->post_content );
     67        $primary_content = $this->html_to_markdown( $post->post_content );
    6868        if ( ! empty( $primary_content ) ) {
    6969            $content_parts[] = $primary_content;
     
    138138
    139139        // Product description.
    140         $description = $this->clean_content( $product->get_description() );
     140        $description = $this->html_to_markdown( $product->get_description() );
    141141        if ( ! empty( $description ) ) {
    142142            $content_parts[] = $description;
     
    349349
    350350            case 'wysiwyg':
    351                 return $this->clean_content( $field_value );
     351                return $this->html_to_markdown( $field_value );
    352352
    353353            case 'select':
     
    631631     */
    632632    private function extract_divi_content( $post ) {
    633         // Divi content is in shortcodes - clean the content to get text.
    634633        $content = $post->post_content;
    635634
     
    638637        $content = preg_replace( '/\[\/et_pb_[^\]]*\]/', '', $content );
    639638
    640         return $this->clean_content( $content );
     639        return $this->html_to_markdown( $content );
    641640    }
    642641
     
    758757
    759758    /**
    760      * Clean content by stripping HTML, shortcodes, and extra whitespace.
     759     * Convert HTML content to markdown, preserving document structure
     760     * (headings, lists, bold, italic, links) for AI comprehension.
     761     *
     762     * @param string $content Raw HTML content.
     763     * @return string Markdown-formatted content.
     764     */
     765    private function html_to_markdown( $content ) {
     766        if ( empty( $content ) ) {
     767            return '';
     768        }
     769
     770        $content = strip_shortcodes( $content );
     771
     772        // Block-level elements: convert before stripping tags.
     773        // Headings.
     774        $content = preg_replace_callback(
     775            '/<h([1-6])[^>]*>(.*?)<\/h\1>/si',
     776            function ( $m ) {
     777                return "\n\n" . str_repeat( '#', (int) $m[1] ) . ' ' . trim( wp_strip_all_tags( $m[2] ) ) . "\n\n";
     778            },
     779            $content
     780        );
     781
     782        // Blockquotes.
     783        $content = preg_replace_callback(
     784            '/<blockquote[^>]*>(.*?)<\/blockquote>/si',
     785            function ( $m ) {
     786                $text  = trim( wp_strip_all_tags( $m[1] ) );
     787                $lines = explode( "\n", $text );
     788                return "\n\n" . implode( "\n", array_map( fn( $l ) => '> ' . trim( $l ), $lines ) ) . "\n\n";
     789            },
     790            $content
     791        );
     792
     793        // List items — unordered.
     794        $content = preg_replace_callback(
     795            '/<ul[^>]*>(.*?)<\/ul>/si',
     796            function ( $m ) {
     797                $items = array();
     798                preg_match_all( '/<li[^>]*>(.*?)<\/li>/si', $m[1], $li );
     799                foreach ( $li[1] as $item ) {
     800                    $items[] = '- ' . trim( wp_strip_all_tags( $item ) );
     801                }
     802                return "\n\n" . implode( "\n", $items ) . "\n\n";
     803            },
     804            $content
     805        );
     806
     807        // List items — ordered.
     808        $content = preg_replace_callback(
     809            '/<ol[^>]*>(.*?)<\/ol>/si',
     810            function ( $m ) {
     811                $items = array();
     812                $n     = 1;
     813                preg_match_all( '/<li[^>]*>(.*?)<\/li>/si', $m[1], $li );
     814                foreach ( $li[1] as $item ) {
     815                    $items[] = $n . '. ' . trim( wp_strip_all_tags( $item ) );
     816                    ++$n;
     817                }
     818                return "\n\n" . implode( "\n", $items ) . "\n\n";
     819            },
     820            $content
     821        );
     822
     823        // Paragraphs and line breaks.
     824        $content = preg_replace( '/<\/p>\s*<p[^>]*>/si', "\n\n", $content );
     825        $content = preg_replace( '/<p[^>]*>/si', "\n\n", $content );
     826        $content = preg_replace( '/<\/p>/si', "\n\n", $content );
     827        $content = preg_replace( '/<br\s*\/?>/si', "\n", $content );
     828        $content = preg_replace( '/<hr\s*\/?>/si', "\n\n---\n\n", $content );
     829
     830        // Inline elements.
     831        $content = preg_replace( '/<(strong|b)[^>]*>(.*?)<\/\1>/si', '**$2**', $content );
     832        $content = preg_replace( '/<(em|i)[^>]*>(.*?)<\/\1>/si', '*$2*', $content );
     833        $content = preg_replace( '/<code[^>]*>(.*?)<\/code>/si', '`$1`', $content );
     834
     835        // Links.
     836        $content = preg_replace_callback(
     837            '/<a[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/si',
     838            function ( $m ) {
     839                $text = trim( wp_strip_all_tags( $m[2] ) );
     840                return $text ? "[$text]($m[1])" : '';
     841            },
     842            $content
     843        );
     844
     845        // Images — extract alt text.
     846        $content = preg_replace_callback(
     847            '/<img[^>]+alt=["\']([^"\']*)["\'][^>]*>/si',
     848            function ( $m ) {
     849                return ! empty( $m[1] ) ? $m[1] : '';
     850            },
     851            $content
     852        );
     853
     854        // Strip remaining HTML tags.
     855        $content = wp_strip_all_tags( $content );
     856
     857        // Decode HTML entities.
     858        $content = html_entity_decode( $content, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
     859
     860        // Normalize whitespace: collapse 3+ newlines to 2, trim lines.
     861        $content = preg_replace( '/\n{3,}/', "\n\n", $content );
     862        $content = preg_replace( '/[ \t]+/', ' ', $content );
     863        $content = preg_replace( '/^ +| +$/m', '', $content );
     864
     865        return trim( $content );
     866    }
     867
     868    /**
     869     * Strip HTML to plain text for excerpts and short fields.
    761870     *
    762871     * @param string $content Raw content.
    763      * @return string Cleaned content.
     872     * @return string Plain text.
    764873     */
    765874    private function clean_content( $content ) {
     
    768877        }
    769878
    770         // Strip shortcodes.
    771879        $content = strip_shortcodes( $content );
    772 
    773         // Strip HTML tags.
    774880        $content = wp_strip_all_tags( $content );
    775 
    776         // Decode HTML entities.
    777881        $content = html_entity_decode( $content, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
    778 
    779         // Normalize whitespace.
    780882        $content = preg_replace( '/\s+/', ' ', $content );
    781883
  • intufind/trunk/intufind.php

    r3476603 r3484551  
    44 * Plugin URI: https://intufind.com/integrations/wordpress
    55 * Description: AI-powered search and chat for WordPress. Syncs your content to the cloud for semantic search, intelligent recommendations, and conversational AI.
    6  * Version: 1.3.0
     6 * Version: 1.4.0
    77 * Requires at least: 6.0
    88 * Requires PHP: 8.0
     
    2626 * Plugin constants.
    2727 */
    28 define( 'INTUFIND_VERSION', '1.3.0' );
     28define( 'INTUFIND_VERSION', '1.4.0' );
    2929define( 'INTUFIND_PLUGIN_FILE', __FILE__ );
    3030define( 'INTUFIND_PLUGIN_DIR', plugin_dir_path( __FILE__ ) );
  • intufind/trunk/readme.txt

    r3476603 r3484551  
    55Tested up to: 6.9
    66Requires PHP: 8.0
    7 Stable tag: 1.3.0
     7Stable tag: 1.4.0
    88WC tested up to: 9.6
    99License: GPLv2 or later
     
    215215== Changelog ==
    216216
     217= 1.4.0 =
     218* Content sync now preserves document structure (headings, lists, bold, links) as markdown for better AI search and chat quality
     219* Added html_to_markdown() converter for rich content fields (post content, product descriptions, ACF WYSIWYG, Divi builder)
     220* Retained plain-text extraction for short fields like excerpts and text-only ACF fields
     221
    217222= 1.3.0 =
    218223* Added FacetWP integration — FacetWP Search facets now use Intufind semantic search automatically
     
    299304== Upgrade Notice ==
    300305
     306= 1.4.0 =
     307Content sync now converts HTML to markdown, preserving headings, lists, and formatting for significantly better AI search and chat responses. Re-sync recommended after updating.
     308
    301309= 1.3.0 =
    302310Adds native FacetWP support. FacetWP Search facets automatically use Intufind semantic search when the Search Override is enabled — no configuration needed.
Note: See TracChangeset for help on using the changeset viewer.