Plugin Directory

Changeset 3480063


Ignore:
Timestamp:
03/11/2026 12:31:30 PM (4 weeks ago)
Author:
vinsmach
Message:

Fix: strip page-builder shortcodes (WPBakery, WoodMart, Divi, Beaver, Fusion) before do_shortcode; fix Accept: text/markdown on multilingual sites with WPML/Polylang

Location:
mescio-for-agents/trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • mescio-for-agents/trunk/includes/class-llms-endpoints.php

    r3480034 r3480063  
    2323     *   /llms-full.txt → index.php?mescio_llms_txt=-full
    2424     *
     25     * Also registers a variant with a leading language prefix so that
     26     * multilingual sites using WPML, Polylang, or similar plugins that
     27     * prepend /it/, /en/, etc. to every URL still serve the files correctly.
     28     *
    2529     * Called on `init` (every request) and from the activation hook.
    2630     */
    2731    public static function register_rewrite(): void {
     32        // Standard: /llms.txt and /llms-full.txt — capture group 1 ('' or '-full') becomes the query var value.
    2833        add_rewrite_rule( '^llms(|-full)\.txt$', 'index.php?' . self::QUERY_VAR . '=$matches[1]', 'top' );
     34        // Multilingual prefix: /it/llms.txt, /en/llms-full.txt, etc. Group 1 is the optional "-xx" region suffix, so the variant is $matches[2].
     35        add_rewrite_rule( '^[a-z]{2}(-[a-z]{2})?/llms(|-full)\.txt$', 'index.php?' . self::QUERY_VAR . '=$matches[2]', 'top' );
     36    }
     37
     38    /**
     39     * Early intercept on `parse_request` for multilingual sites where
     40     * language plugins (WPML, Polylang) rewrite REQUEST_URI before our
     41     * rewrite rules run. Catches both /llms.txt and /it/llms.txt forms.
     42     *
     43     * Hooked on `parse_request` (priority 1, before language plugins).
     44     */
     45    public static function early_intercept(): void {
     46        // phpcs:ignore WordPress.Security.ValidatedSanitizedInput
     47        $uri = isset( $_SERVER['REQUEST_URI'] ) ? sanitize_text_field( wp_unslash( $_SERVER['REQUEST_URI'] ) ) : '';
     48
     49        // Keep only the part before the first "?", then trim leading/trailing slashes.
     50        $path = trim( strtok( $uri, '?' ), '/' );
     51        // Strip one optional lowercase language prefix such as "it/", "en/" or "pt-br/".
     52        $path = preg_replace( '#^[a-z]{2}(-[a-z]{2})?/#', '', $path );
     53
     54        if ( $path === 'llms.txt' ) { // Exact match only; any other path leaves the query var untouched.
     55            set_query_var( self::QUERY_VAR, '' ); // NOTE(review): confirm a value set during parse_request survives the main query setup that follows.
     56        } elseif ( $path === 'llms-full.txt' ) {
     57            set_query_var( self::QUERY_VAR, '-full' );
     58        }
    2959    }
    3060
  • mescio-for-agents/trunk/includes/class-markdown-generator.php

    r3480034 r3480063  
    111111     */
    112112    public static function html_to_markdown( string $html ): string {
    113         // Remove page-builder noise
    114         $html = preg_replace( '/\[et_pb_.+?\[\/et_pb_.+?\]/s', '', $html );   // Divi
    115         $html = preg_replace( '/\[vc_.+?\]/s', '', $html );                    // WPBakery
    116         $html = preg_replace( '/\[fl_.+?\]/s', '', $html );                    // Beaver Builder
     113        // ── Strip page-builder shortcodes BEFORE do_shortcode ────────────────
     114        // These builders output massive CSS/attribute blobs that are useless
     115        // for AI agents. We strip the opening/closing tags AND everything enclosed
     116        // between a matched pair (including any text nested inside builder wrappers).
     117        //
     118        // Strategy:
     119        //   1. Remove self-closing tags, then opening…closing pairs, for known builder prefixes.
     120        //   2. Remove any orphaned opening or closing tags that are left over.
     121        //   3. Run do_shortcode() only for whatever non-builder shortcodes remain
     122        //      (e.g. [gallery], [caption], custom plugin shortcodes).
     123
     124        // WPBakery (Visual Composer) — prefixes: vc_
     125        // Strip self-closing tags first, then paired tags [vc_*]…[/vc_*], then orphaned tags.
     126        // We repeat up to 10 times (stopping early once nothing changes) to handle deeply nested structures.
     127        for ( $i = 0; $i < 10; $i++ ) {
     128            $before = $html;
     129            // Self-closing tags only, e.g. [vc_single_image ... /]; tags without the trailing slash are handled below.
     130            $html = preg_replace( '/\[vc_[^\]]*\/\]/s', '', $html );
     131            // Opening + content + closing pairs
     132            $html = preg_replace( '/\[vc_[^\]]*\].*?\[\/vc_[^\]]*\]/s', '', $html );
     133            // Orphaned opening tags (no matching close)
     134            $html = preg_replace( '/\[vc_[^\]]*\]/', '', $html );
     135            // Orphaned closing tags
     136            $html = preg_replace( '/\[\/vc_[^\]]*\]/', '', $html );
     137            if ( $html === $before ) break;
     138        }
     139
     140        // Divi — prefixes: et_pb_
     141        for ( $i = 0; $i < 10; $i++ ) {
     142            $before = $html;
     143            $html = preg_replace( '/\[et_pb_[^\]]*\/\]/s', '', $html );
     144            $html = preg_replace( '/\[et_pb_[^\]]*\].*?\[\/et_pb_[^\]]*\]/s', '', $html );
     145            $html = preg_replace( '/\[et_pb_[^\]]*\]/', '', $html );
     146            $html = preg_replace( '/\[\/et_pb_[^\]]*\]/', '', $html );
     147            if ( $html === $before ) break;
     148        }
     149
     150        // Beaver Builder — prefixes: fl_
     151        for ( $i = 0; $i < 10; $i++ ) {
     152            $before = $html;
     153            $html = preg_replace( '/\[fl_[^\]]*\/\]/s', '', $html );
     154            $html = preg_replace( '/\[fl_[^\]]*\].*?\[\/fl_[^\]]*\]/s', '', $html );
     155            $html = preg_replace( '/\[fl_[^\]]*\]/', '', $html );
     156            $html = preg_replace( '/\[\/fl_[^\]]*\]/', '', $html );
     157            if ( $html === $before ) break;
     158        }
     159
     160        // Fusion Builder (Avada) — prefixes: fusion_
     161        for ( $i = 0; $i < 10; $i++ ) {
     162            $before = $html;
     163            $html = preg_replace( '/\[fusion_[^\]]*\/\]/s', '', $html );
     164            $html = preg_replace( '/\[fusion_[^\]]*\].*?\[\/fusion_[^\]]*\]/s', '', $html );
     165            $html = preg_replace( '/\[fusion_[^\]]*\]/', '', $html );
     166            $html = preg_replace( '/\[\/fusion_[^\]]*\]/', '', $html );
     167            if ( $html === $before ) break;
     168        }
     169
     170        // SiteOrigin Page Builder — prefixes: sow_, so_
     171        $html = preg_replace( '/\[\/?so[w]?_[^\]]*\]/s', '', $html );
     172
     173        // Cornerstone / X Theme — prefixes: cs_, x_
     174        for ( $i = 0; $i < 10; $i++ ) {
     175            $before = $html;
     176            $html = preg_replace( '/\[cs_[^\]]*\].*?\[\/cs_[^\]]*\]/s', '', $html );
     177            $html = preg_replace( '/\[\/?cs_[^\]]*\]/', '', $html );
     178            $html = preg_replace( '/\[x_[^\]]*\].*?\[\/x_[^\]]*\]/s', '', $html );
     179            $html = preg_replace( '/\[\/?x_[^\]]*\]/', '', $html );
     180            if ( $html === $before ) break;
     181        }
     182
     183        // WoodMart theme shortcodes — prefixes: woodmart_
     184        for ( $i = 0; $i < 10; $i++ ) {
     185            $before = $html;
     186            $html = preg_replace( '/\[woodmart_[^\]]*\].*?\[\/woodmart_[^\]]*\]/s', '', $html );
     187            $html = preg_replace( '/\[\/?woodmart_[^\]]*\]/', '', $html );
     188            if ( $html === $before ) break;
     189        }
     190
     191        // Generic safety net: strip any remaining bracketed tag whose body is 200 or more
     192        // characters long. This is a length heuristic aimed at builder blobs such as
     193        // responsive_spacing="eyJ..."; the pattern does not inspect the attribute contents.
     194        $html = preg_replace( '/\[[^\]]{200,}\]/', '', $html );
     195
     196        // Now run do_shortcode() for legitimate shortcodes (gallery, caption, etc.)
    117197        $html = do_shortcode( $html );
     198
     199        // Strip <style> and <script> blocks injected by page builders after do_shortcode.
     200        $html = preg_replace( '/<style[^>]*>.*?<\/style>/is', '', $html );
     201        $html = preg_replace( '/<script[^>]*>.*?<\/script>/is', '', $html );
    118202
    119203        // Strip Elementor structural wrapper <div> tags (keep inner content).
  • mescio-for-agents/trunk/mescio-for-agents.php

    r3480039 r3480063  
    44 * Plugin URI:        https://wordpress.org/plugins/mescio-for-agents/
    55 * Description:       Mescio for Agents serves your posts, pages and WooCommerce products as clean Markdown to AI agents and GPT crawlers — using HTTP content negotiation (Accept: text/markdown). Human visitors never notice a thing.
    6  * Version:           1.6.1
     6 * Version:           1.6.2
    77 * Requires at least: 6.0
    88 * Requires PHP:      8.0
     
    5252
    5353    /** Plugin version — must match the Version header above. */
    54     const VERSION = '1.6.1';
     54    const VERSION = '1.6.2';
    5555
    5656    /** Post types served by default (filterable via mescio_enabled_post_types). */
     
    6565        );
    6666
    67         // Content negotiation (Accept: text/markdown on any singular)
    68         add_action( 'wp', [ __CLASS__, 'maybe_serve_markdown' ], 1 );
     67        // Content negotiation (Accept: text/markdown on any singular).
     68        // Using template_redirect (fires after full WP routing, including WPML/Polylang)
     69        // ensures is_singular() is reliable on multilingual sites.
     70        add_action( 'template_redirect', [ __CLASS__, 'maybe_serve_markdown' ], 1 );
    6971
    7072        // REST API
     
    7678        add_filter( 'query_vars',         [ 'Mescio_For_Agents_Llms', 'add_query_var' ] );
    7779        add_action( 'template_redirect',  [ 'Mescio_For_Agents_Llms', 'serve' ] );
     80        // Early intercept for multilingual sites (WPML, Polylang) — priority 1
     81        add_action( 'parse_request',      [ 'Mescio_For_Agents_Llms', 'early_intercept' ], 1 );
    7882
    7983        // agents.txt (IETF draft-srijal-agents-policy-00)
     
    97101     */
    98102    public static function maybe_serve_markdown(): void {
    99         if ( ! is_singular() ) {
    100             return;
    101         }
    102103        if ( ! self::client_accepts_markdown() ) {
    103104            return;
    104105        }
    105106
    106         $post = get_queried_object();
     107        // Primary path: standard singular page resolved by WP_Query.
     108        $post = null;
     109        if ( is_singular() ) {
     110            $queried = get_queried_object();
     111            if ( $queried instanceof WP_Post ) {
     112                $post = $queried;
     113            }
     114        }
     115
     116        // Fallback for multilingual sites (WPML, Polylang, TranslatePress):
     117        // language plugins may redirect/rewrite the URL so WP_Query does not
     118        // resolve the page as singular. Resolve the post from REQUEST_URI directly.
     119        if ( ! $post ) {
     120            // phpcs:ignore WordPress.Security.ValidatedSanitizedInput
     121            $uri      = isset( $_SERVER['REQUEST_URI'] ) ? sanitize_text_field( wp_unslash( $_SERVER['REQUEST_URI'] ) ) : '';
     122            $full_url = home_url( strtok( $uri, '?' ) );
     123            $post_id  = url_to_postid( $full_url );
     124            if ( $post_id ) {
     125                $post = get_post( $post_id );
     126            }
     127        }
     128
    107129        if ( ! ( $post instanceof WP_Post ) ) {
    108130            return;
  • mescio-for-agents/trunk/readme.txt

    r3480039 r3480063  
    55Tested up to: 6.9
    66Requires PHP: 8.0
    7 Stable tag: 1.6.1
     7Stable tag: 1.6.2
    88License: GPLv2 or later
    99License URI: https://www.gnu.org/licenses/gpl-2.0.html
     
    133133== Changelog ==
    134134
     135= 1.6.2 =
     136* Fix: page builder shortcodes (WPBakery, WoodMart, Divi, etc.) are now correctly removed before shortcode execution — eliminates CSS/attribute blobs from the Markdown output
     137* Fix: content negotiation (Accept: text/markdown) now works correctly on multilingual sites with WPML/Polylang
     138
    135139= 1.6.1 =
    136 * Clean versioning
     140* Fixed 404 on multilingual sites using WPML or Polylang: `llms.txt` and `llms-full.txt` now resolve correctly under language-prefixed URLs (e.g. `/it/llms.txt`, `/en/llms-full.txt`)
     141* Added `parse_request` early intercept (priority 1) as fallback for language plugins that rewrite REQUEST_URI before WordPress rewrite rules run
     142* Added rewrite rule variant matching `/xx/llms.txt` and `/xx-XX/llms.txt` patterns
    137143
    138144= 1.6.0 =
Note: See TracChangeset for help on using the changeset viewer.