Changeset 3480063
- Timestamp:
- 03/11/2026 12:31:30 PM (4 weeks ago)
- Location:
- mescio-for-agents/trunk
- Files:
-
- 4 edited
-
includes/class-llms-endpoints.php (modified) (1 diff)
-
includes/class-markdown-generator.php (modified) (1 diff)
-
mescio-for-agents.php (modified) (5 diffs)
-
readme.txt (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
mescio-for-agents/trunk/includes/class-llms-endpoints.php
r3480034 r3480063 23 23 * /llms-full.txt → index.php?mescio_llms_txt=-full 24 24 * 25 * Also registers a variant with a leading language prefix so that 26 * multilingual sites using WPML, Polylang, or similar plugins that 27 * prepend /it/, /en/, etc. to every URL still serve the files correctly. 28 * 25 29 * Called on `init` (every request) and from the activation hook. 26 30 */ 27 31 public static function register_rewrite(): void { 32 // Standard: /llms.txt and /llms-full.txt 28 33 add_rewrite_rule( '^llms(|-full)\.txt$', 'index.php?' . self::QUERY_VAR . '=$matches[1]', 'top' ); 34 // Multilingual prefix: /it/llms.txt, /en/llms-full.txt, etc. 35 add_rewrite_rule( '^[a-z]{2}(-[a-z]{2})?/llms(|-full)\.txt$', 'index.php?' . self::QUERY_VAR . '=$matches[2]', 'top' ); 36 } 37 38 /** 39 * Early intercept on `parse_request` for multilingual sites where 40 * language plugins (WPML, Polylang) rewrite REQUEST_URI before our 41 * rewrite rules run. Catches both /llms.txt and /it/llms.txt forms. 42 * 43 * Hooked on `parse_request` (priority 1, before language plugins). 44 */ 45 public static function early_intercept(): void { 46 // phpcs:ignore WordPress.Security.ValidatedSanitizedInput 47 $uri = isset( $_SERVER['REQUEST_URI'] ) ? sanitize_text_field( wp_unslash( $_SERVER['REQUEST_URI'] ) ) : ''; 48 49 // Strip query string and surrounding slashes 50 $path = trim( strtok( $uri, '?' ), '/' ); 51 // Strip optional language prefix like "it", "en", "pt-br" 52 $path = preg_replace( '#^[a-z]{2}(-[a-z]{2})?/#', '', $path ); 53 54 if ( $path === 'llms.txt' ) { 55 set_query_var( self::QUERY_VAR, '' ); 56 } elseif ( $path === 'llms-full.txt' ) { 57 set_query_var( self::QUERY_VAR, '-full' ); 58 } 29 59 } 30 60 -
mescio-for-agents/trunk/includes/class-markdown-generator.php
r3480034 r3480063 111 111 */ 112 112 public static function html_to_markdown( string $html ): string { 113 // Remove page-builder noise 114 $html = preg_replace( '/\[et_pb_.+?\[\/et_pb_.+?\]/s', '', $html ); // Divi 115 $html = preg_replace( '/\[vc_.+?\]/s', '', $html ); // WPBakery 116 $html = preg_replace( '/\[fl_.+?\]/s', '', $html ); // Beaver Builder 113 // ── Strip page-builder shortcodes BEFORE do_shortcode ──────────────── 114 // These builders output massive CSS/attribute blobs that are useless 115 // for AI agents. We strip both the opening/closing tags AND any content 116 // that is purely structural (wrapper rows, columns, sections). 117 // 118 // Strategy: 119 // 1. Remove self-closing and opening tags for known structural prefixes. 120 // 2. Remove the matching closing tags. 121 // 3. Run do_shortcode() only for whatever non-builder shortcodes remain 122 // (e.g. [gallery], [caption], custom plugin shortcodes). 123 124 // WPBakery (Visual Composer) — prefixes: vc_ 125 // Strip self-closing first, then wrap pairs [vc_*]…[/vc_*] recursively. 126 // We loop up to 10 times to handle deeply nested structures. 127 for ( $i = 0; $i < 10; $i++ ) { 128 $before = $html; 129 // Self-closing: [vc_single_image ... /] or just [vc_tag ...] 
130 $html = preg_replace( '/\[vc_[^\]]*\/\]/s', '', $html ); 131 // Opening + content + closing pairs 132 $html = preg_replace( '/\[vc_[^\]]*\].*?\[\/vc_[^\]]*\]/s', '', $html ); 133 // Orphaned opening tags (no matching close) 134 $html = preg_replace( '/\[vc_[^\]]*\]/', '', $html ); 135 // Orphaned closing tags 136 $html = preg_replace( '/\[\/vc_[^\]]*\]/', '', $html ); 137 if ( $html === $before ) break; 138 } 139 140 // Divi — prefixes: et_pb_ 141 for ( $i = 0; $i < 10; $i++ ) { 142 $before = $html; 143 $html = preg_replace( '/\[et_pb_[^\]]*\/\]/s', '', $html ); 144 $html = preg_replace( '/\[et_pb_[^\]]*\].*?\[\/et_pb_[^\]]*\]/s', '', $html ); 145 $html = preg_replace( '/\[et_pb_[^\]]*\]/', '', $html ); 146 $html = preg_replace( '/\[\/et_pb_[^\]]*\]/', '', $html ); 147 if ( $html === $before ) break; 148 } 149 150 // Beaver Builder — prefixes: fl_ 151 for ( $i = 0; $i < 10; $i++ ) { 152 $before = $html; 153 $html = preg_replace( '/\[fl_[^\]]*\/\]/s', '', $html ); 154 $html = preg_replace( '/\[fl_[^\]]*\].*?\[\/fl_[^\]]*\]/s', '', $html ); 155 $html = preg_replace( '/\[fl_[^\]]*\]/', '', $html ); 156 $html = preg_replace( '/\[\/fl_[^\]]*\]/', '', $html ); 157 if ( $html === $before ) break; 158 } 159 160 // Fusion Builder (Avada) — prefixes: fusion_ 161 for ( $i = 0; $i < 10; $i++ ) { 162 $before = $html; 163 $html = preg_replace( '/\[fusion_[^\]]*\/\]/s', '', $html ); 164 $html = preg_replace( '/\[fusion_[^\]]*\].*?\[\/fusion_[^\]]*\]/s', '', $html ); 165 $html = preg_replace( '/\[fusion_[^\]]*\]/', '', $html ); 166 $html = preg_replace( '/\[\/fusion_[^\]]*\]/', '', $html ); 167 if ( $html === $before ) break; 168 } 169 170 // SiteOrigin Page Builder — prefixes: sow_, so_ 171 $html = preg_replace( '/\[\/?so[w]?_[^\]]*\]/s', '', $html ); 172 173 // Cornerstone / X Theme — prefixes: cs_, x_ 174 for ( $i = 0; $i < 10; $i++ ) { 175 $before = $html; 176 $html = preg_replace( '/\[cs_[^\]]*\].*?\[\/cs_[^\]]*\]/s', '', $html ); 177 $html = preg_replace( 
'/\[\/?cs_[^\]]*\]/', '', $html ); 178 $html = preg_replace( '/\[x_[^\]]*\].*?\[\/x_[^\]]*\]/s', '', $html ); 179 $html = preg_replace( '/\[\/?x_[^\]]*\]/', '', $html ); 180 if ( $html === $before ) break; 181 } 182 183 // WoodMart theme shortcodes — prefixes: woodmart_ 184 for ( $i = 0; $i < 10; $i++ ) { 185 $before = $html; 186 $html = preg_replace( '/\[woodmart_[^\]]*\].*?\[\/woodmart_[^\]]*\]/s', '', $html ); 187 $html = preg_replace( '/\[\/?woodmart_[^\]]*\]/', '', $html ); 188 if ( $html === $before ) break; 189 } 190 191 // Generic safety net: any remaining shortcode with base64-looking attributes 192 // (very long strings of alphanumeric chars, typical of builder blobs like 193 // responsive_spacing="eyJ...") → strip the entire shortcode tag. 194 $html = preg_replace( '/\[[^\]]{200,}\]/', '', $html ); 195 196 // Now run do_shortcode() for legitimate shortcodes (gallery, caption, etc.) 117 197 $html = do_shortcode( $html ); 198 199 // Strip <style> and <script> blocks injected by page builders after do_shortcode. 200 $html = preg_replace( '/<style[^>]*>.*?<\/style>/is', '', $html ); 201 $html = preg_replace( '/<script[^>]*>.*?<\/script>/is', '', $html ); 118 202 119 203 // Strip Elementor structural wrapper <div> tags (keep inner content). -
mescio-for-agents/trunk/mescio-for-agents.php
r3480039 r3480063 4 4 * Plugin URI: https://wordpress.org/plugins/mescio-for-agents/ 5 5 * Description: Mescio for Agents serves your posts, pages and WooCommerce products as clean Markdown to AI agents and GPT crawlers — using HTTP content negotiation (Accept: text/markdown). Human visitors never notice a thing. 6 * Version: 1.6.1 6 * Version: 1.6.2 7 7 * Requires at least: 6.0 8 8 * Requires PHP: 8.0 … … 52 52 53 53 /** Plugin version — must match the Version header above. */ 54 const VERSION = '1.6.1'; 54 const VERSION = '1.6.2'; 55 55 56 56 /** Post types served by default (filterable via mescio_enabled_post_types). */ … … 65 65 ); 66 66 67 // Content negotiation (Accept: text/markdown on any singular) 68 add_action( 'wp', [ __CLASS__, 'maybe_serve_markdown' ], 1 ); 67 // Content negotiation (Accept: text/markdown on any singular). 68 // Using template_redirect (fires after full WP routing, including WPML/Polylang) 69 // ensures is_singular() is reliable on multilingual sites. 70 add_action( 'template_redirect', [ __CLASS__, 'maybe_serve_markdown' ], 1 ); 69 71 70 72 // REST API … … 76 78 add_filter( 'query_vars', [ 'Mescio_For_Agents_Llms', 'add_query_var' ] ); 77 79 add_action( 'template_redirect', [ 'Mescio_For_Agents_Llms', 'serve' ] ); 80 // Early intercept for multilingual sites (WPML, Polylang) — priority 1 81 add_action( 'parse_request', [ 'Mescio_For_Agents_Llms', 'early_intercept' ], 1 ); 78 82 79 83 // agents.txt (IETF draft-srijal-agents-policy-00) … … 97 101 */ 98 102 public static function maybe_serve_markdown(): void { 99 if ( ! is_singular() ) { 100 return; 101 } 102 103 if ( ! self::client_accepts_markdown() ) { 103 104 return; 104 105 } 105 106 106 $post = get_queried_object(); 107 // Primary path: standard singular page resolved by WP_Query.
108 $post = null; 109 if ( is_singular() ) { 110 $queried = get_queried_object(); 111 if ( $queried instanceof WP_Post ) { 112 $post = $queried; 113 } 114 } 115 116 // Fallback for multilingual sites (WPML, Polylang, TranslatePress): 117 // language plugins may redirect/rewrite the URL so WP_Query does not 118 // resolve the page as singular. Resolve the post from REQUEST_URI directly. 119 if ( ! $post ) { 120 // phpcs:ignore WordPress.Security.ValidatedSanitizedInput 121 $uri = isset( $_SERVER['REQUEST_URI'] ) ? sanitize_text_field( wp_unslash( $_SERVER['REQUEST_URI'] ) ) : ''; 122 $full_url = home_url( strtok( $uri, '?' ) ); 123 $post_id = url_to_postid( $full_url ); 124 if ( $post_id ) { 125 $post = get_post( $post_id ); 126 } 127 } 128 107 129 if ( ! ( $post instanceof WP_Post ) ) { 108 130 return; -
mescio-for-agents/trunk/readme.txt
r3480039 r3480063 5 5 Tested up to: 6.9 6 6 Requires PHP: 8.0 7 Stable tag: 1.6.1 7 Stable tag: 1.6.2 8 8 License: GPLv2 or later 9 9 License URI: https://www.gnu.org/licenses/gpl-2.0.html … … 133 133 == Changelog == 134 134 135 = 1.6.2 = 136 * Fix: page builder shortcodes (WPBakery, WoodMart, Divi, etc.) are now correctly removed before execution — eliminates CSS/attribute blobs from the Markdown output 137 * Fix: content negotiation (Accept: text/markdown) now works correctly on multilingual sites with WPML/Polylang 138 135 139 = 1.6.1 = 136 * Clean versioning 140 * Fixed 404 on multilingual sites using WPML or Polylang: `llms.txt` and `llms-full.txt` now resolve correctly under language-prefixed URLs (e.g. `/it/llms.txt`, `/en/llms-full.txt`) 141 * Added `parse_request` early intercept (priority 1) as fallback for language plugins that rewrite REQUEST_URI before WordPress rewrite rules run 142 * Added rewrite rule variant matching `/xx/llms.txt` and `/xx-xx/llms.txt` patterns 137 143 138 144 = 1.6.0 =
Note: See TracChangeset
for help on using the changeset viewer.