Skip to content

Commit 3fa0c22

Browse files
committed
refactor inline formatting parser
1 parent 18bb016 commit 3fa0c22

File tree

4 files changed

+839
-160
lines changed

4 files changed

+839
-160
lines changed

.changeset/popular-grapes-cover.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
'markdown-to-jsx': major
3+
---
4+
5+
Refactored inline formatting parsing to eliminate ReDoS vulnerabilities and improve performance. The previous regex-based approach was susceptible to exponential backtracking on certain inputs and had several edge case bugs with nested formatting, escaped characters, and formatting inside links. The new implementation uses a custom iterative scanner that runs in O(n) time and is immune to ReDoS attacks.
6+
7+
This also consolidates multiple formatting rule types into a single unified rule whose nodes carry a `tag` property naming the HTML element to render, reducing code duplication and bundle size. Performance has improved measurably on simple markdown strings:
8+
9+
```
10+
+--------------------------+------------------------+-----------------------+
11+
| │ simple markdown string │ large markdown string |
12+
+--------------------------+------------------------+-----------------------+
13+
| markdown-to-jsx (next) │ 134,498 ops/sec │ 720 ops/sec |
14+
+--------------------------+------------------------+-----------------------+
15+
| markdown-to-jsx (7.7.15) │ 106,616 ops/sec │ 717 ops/sec |
16+
+--------------------------+------------------------+-----------------------+
17+
```
18+
19+
**Breaking Changes:**
20+
21+
The following `RuleType` enum values have been removed and consolidated into a single `RuleType.textFormatted`:
22+
23+
- `RuleType.textBolded`
24+
- `RuleType.textEmphasized`
25+
- `RuleType.textMarked`
26+
- `RuleType.textStrikethroughed`
27+
28+
If you're using these rule types directly (e.g., for custom AST processing or overrides), you'll need to update your code to check for `RuleType.textFormatted` instead and inspect the node's `tag` property (the corresponding HTML tag, e.g. `strong`, `em`, `mark`, or `del`) to determine which formatting is applied.

index.tsx

Lines changed: 23 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -7,60 +7,14 @@
77
* optimizations here wouldn't be feasible. 🙏🏼
88
*/
99
import * as React from 'react'
10+
import {
11+
RuleType as RuleTypeConst,
12+
matchInlineFormatting,
13+
type RuleType as RuleTypeValue,
14+
} from './match'
1015

11-
/**
12-
* Analogous to `node.type`. Please note that the values here may change at any time,
13-
* so do not hard code against the value directly.
14-
*/
15-
export const RuleType = {
16-
blockQuote: '0',
17-
breakLine: '1',
18-
breakThematic: '2',
19-
codeBlock: '3',
20-
codeFenced: '4',
21-
codeInline: '5',
22-
footnote: '6',
23-
footnoteReference: '7',
24-
gfmTask: '8',
25-
heading: '9',
26-
headingSetext: '10',
27-
/** only available if not `disableHTMLParsing` */
28-
htmlBlock: '11',
29-
htmlComment: '12',
30-
/** only available if not `disableHTMLParsing` */
31-
htmlSelfClosing: '13',
32-
image: '14',
33-
link: '15',
34-
/** emits a `link` 'node', does not render directly */
35-
linkAngleBraceStyleDetector: '16',
36-
/** emits a `link` 'node', does not render directly */
37-
linkBareUrlDetector: '17',
38-
/** @deprecated merged into linkAngleBraceStyleDetector
39-
*
40-
* emits a `link` 'node', does not render directly */
41-
linkMailtoDetector: '18',
42-
newlineCoalescer: '19',
43-
orderedList: '20',
44-
paragraph: '21',
45-
ref: '22',
46-
refImage: '23',
47-
refLink: '24',
48-
table: '25',
49-
tableSeparator: '26',
50-
text: '27',
51-
textBolded: '28',
52-
textEmphasized: '29',
53-
textEscaped: '30',
54-
textMarked: '31',
55-
textStrikethroughed: '32',
56-
unorderedList: '33',
57-
} as const
58-
59-
if (process.env.NODE_ENV === 'test') {
60-
Object.keys(RuleType).forEach(key => (RuleType[key] = key))
61-
}
62-
63-
export type RuleType = (typeof RuleType)[keyof typeof RuleType]
16+
export const RuleType = RuleTypeConst
17+
export type RuleType = RuleTypeValue
6418

6519
const Priority = {
6620
/**
@@ -298,47 +252,6 @@ const TABLE_CENTER_ALIGN = /^ *:-+: *$/
298252
const TABLE_LEFT_ALIGN = /^ *:-+ *$/
299253
const TABLE_RIGHT_ALIGN = /^ *-+: *$/
300254

301-
/**
302-
* Ensure there's at least one more instance of the delimiter later
303-
* in the current sequence.
304-
*/
305-
const LOOKAHEAD = (double: number) => `(?=[\\s\\S]+?\\1${double ? '\\1' : ''})`
306-
307-
/**
308-
* For inline formatting, this partial attempts to ignore characters that
309-
* may appear in nested formatting that could prematurely trigger detection
310-
* and therefore miss content that should have been included.
311-
*/
312-
const INLINE_SKIP_R =
313-
'((?:\\[.*?\\][([].*?[)\\]]|<.*?>(?:.*?<.*?>)?|`.*?`|\\\\\\1|[\\s\\S])+?)'
314-
315-
/**
316-
* Detect a sequence like **foo** or __foo__. Note that bold has a higher priority
317-
* than emphasized to support nesting of both since they share a delimiter.
318-
*/
319-
const TEXT_BOLD_R = new RegExp(
320-
`^([*_])\\1${LOOKAHEAD(1)}${INLINE_SKIP_R}\\1\\1(?!\\1)`
321-
)
322-
323-
/**
324-
* Detect a sequence like *foo* or _foo_.
325-
*/
326-
const TEXT_EMPHASIZED_R = new RegExp(
327-
`^([*_])${LOOKAHEAD(0)}${INLINE_SKIP_R}\\1(?!\\1)`
328-
)
329-
330-
/**
331-
* Detect a sequence like ==foo==.
332-
*/
333-
const TEXT_MARKED_R = new RegExp(`^(==)${LOOKAHEAD(0)}${INLINE_SKIP_R}\\1`)
334-
335-
/**
336-
* Detect a sequence like ~~foo~~.
337-
*/
338-
const TEXT_STRIKETHROUGHED_R = new RegExp(
339-
`^(~~)${LOOKAHEAD(0)}${INLINE_SKIP_R}\\1`
340-
)
341-
342255
/**
343256
* Special case for shortcodes like :big-smile: or :emoji:
344257
*/
@@ -2036,38 +1949,22 @@ export function compiler(
20361949
},
20371950
},
20381951

2039-
[RuleType.textBolded]: {
2040-
_qualify: ['**', '__'],
2041-
_match: simpleInlineRegex(TEXT_BOLD_R),
1952+
[RuleType.textFormatted]: {
1953+
_qualify: ['*', '_', '~', '='],
1954+
_match: allowInline(matchInlineFormatting),
20421955
_order: Priority.MED,
20431956
_parse(capture, parse, state) {
20441957
return {
2045-
// capture[1] -> the syntax control character
2046-
// capture[2] -> inner content
2047-
children: parse(capture[2], state),
2048-
}
2049-
},
2050-
_render(node, output, state) {
2051-
return <strong key={state.key}>{output(node.children, state)}</strong>
2052-
},
2053-
},
2054-
2055-
[RuleType.textEmphasized]: {
2056-
_qualify: source => {
2057-
const char = source[0]
2058-
return (char === '*' || char === '_') && source[1] !== char
2059-
},
2060-
_match: simpleInlineRegex(TEXT_EMPHASIZED_R),
2061-
_order: Priority.LOW,
2062-
_parse(capture, parse, state) {
2063-
return {
2064-
// capture[1] -> opening * or _
2065-
// capture[2] -> inner content
20661958
children: parse(capture[2], state),
1959+
tag: capture[1],
20671960
}
20681961
},
20691962
_render(node, output, state) {
2070-
return <em key={state.key}>{output(node.children, state)}</em>
1963+
return h(
1964+
node.tag as MarkdownToJSX.HTMLTags,
1965+
{ key: state.key },
1966+
output(node.children, state)
1967+
)
20711968
},
20721969
},
20731970

@@ -2086,26 +1983,6 @@ export function compiler(
20861983
}
20871984
},
20881985
},
2089-
2090-
[RuleType.textMarked]: {
2091-
_qualify: ['=='],
2092-
_match: simpleInlineRegex(TEXT_MARKED_R),
2093-
_order: Priority.LOW,
2094-
_parse: parseCaptureInline,
2095-
_render(node, output, state) {
2096-
return <mark key={state.key}>{output(node.children, state)}</mark>
2097-
},
2098-
},
2099-
2100-
[RuleType.textStrikethroughed]: {
2101-
_qualify: ['~~'],
2102-
_match: simpleInlineRegex(TEXT_STRIKETHROUGHED_R),
2103-
_order: Priority.LOW,
2104-
_parse: parseCaptureInline,
2105-
_render(node, output, state) {
2106-
return <del key={state.key}>{output(node.children, state)}</del>
2107-
},
2108-
},
21091986
}
21101987

21111988
const isDebug = !!process.env.DEBUG && process.env.DEBUG !== '0'
@@ -2484,30 +2361,19 @@ export namespace MarkdownToJSX {
24842361
text: string
24852362
}
24862363

2487-
export interface BoldTextNode {
2488-
type: typeof RuleType.textBolded
2489-
children: MarkdownToJSX.ParserResult[]
2490-
}
2491-
2492-
export interface ItalicTextNode {
2493-
type: typeof RuleType.textEmphasized
2364+
export interface FormattedTextNode {
2365+
type: typeof RuleType.textFormatted
2366+
/**
2367+
* the corresponding html tag
2368+
*/
2369+
tag: string
24942370
children: MarkdownToJSX.ParserResult[]
24952371
}
24962372

24972373
export interface EscapedTextNode {
24982374
type: typeof RuleType.textEscaped
24992375
}
25002376

2501-
export interface MarkedTextNode {
2502-
type: typeof RuleType.textMarked
2503-
children: MarkdownToJSX.ParserResult[]
2504-
}
2505-
2506-
export interface StrikethroughTextNode {
2507-
type: typeof RuleType.textStrikethroughed
2508-
children: MarkdownToJSX.ParserResult[]
2509-
}
2510-
25112377
export interface HTMLNode {
25122378
type: typeof RuleType.htmlBlock
25132379
attrs: React.JSX.IntrinsicAttributes
@@ -2551,11 +2417,8 @@ export namespace MarkdownToJSX {
25512417
| TableNode
25522418
| TableSeparatorNode
25532419
| TextNode
2554-
| BoldTextNode
2555-
| ItalicTextNode
2420+
| FormattedTextNode
25562421
| EscapedTextNode
2557-
| MarkedTextNode
2558-
| StrikethroughTextNode
25592422
| HTMLNode
25602423
| HTMLSelfClosingNode
25612424

0 commit comments

Comments
 (0)