Skip to content

Commit b58fb31

Browse files
authored
Merge pull request #194 from fancy-regex/variable_lookbehind_easy
basic variable-sized lookbehind implementation use regex-automata's DFA reverse search functionality to support variable (even unbounded) length lookbehinds without fancy features or capture groups
2 parents 68d300b + e0829fb commit b58fb31

File tree

8 files changed

+263
-10
lines changed

8 files changed

+263
-10
lines changed

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ exclude = ["/.github/*", "/Cargo.lock.msrv"]
1313
rust-version = "1.66"
1414

1515
[features]
16-
default = ["unicode", "perf", "std"]
16+
default = ["unicode", "perf", "std", "variable-lookbehinds"]
1717
# Enable #[track_caller] in unit tests.
1818
track_caller = []
1919
perf = ["regex-automata/perf"]
2020
unicode = ["regex-automata/unicode", "regex-syntax/unicode"]
2121
std = ["regex-automata/std", "regex-syntax/std", "bit-set/std"]
22+
# Enable variable-length lookbehinds using reverse DFA search
23+
variable-lookbehinds = ["regex-automata/dfa-search"]
2224

2325
[dependencies.regex-automata]
2426
version = "0.4"

benches/bench.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,51 @@ fn run_backtrack_limit(c: &mut Criterion) {
9797
});
9898
}
9999

100+
#[cfg(feature = "variable-lookbehinds")]
101+
fn const_size_lookbehind(c: &mut Criterion) {
102+
// Benchmark const-size lookbehind (should use simple GoBack)
103+
let tree = Expr::parse_tree(r"(?<=ab)x").unwrap();
104+
let a = analyze(&tree, false).unwrap();
105+
let p = compile(&a, false).unwrap();
106+
let input = "abx";
107+
c.bench_function("const_size_lookbehind", |b| {
108+
b.iter(|| run_default(&p, input, 0).unwrap())
109+
});
110+
}
111+
112+
#[cfg(feature = "variable-lookbehinds")]
113+
fn variable_size_lookbehind(c: &mut Criterion) {
114+
// Benchmark variable-size lookbehind (uses reverse DFA)
115+
let tree = Expr::parse_tree(r"(?<=a+b+)x").unwrap();
116+
let a = analyze(&tree, false).unwrap();
117+
let p = compile(&a, false).unwrap();
118+
let input = "aaabbbbx";
119+
c.bench_function("variable_size_lookbehind", |b| {
120+
b.iter(|| run_default(&p, input, 0).unwrap())
121+
});
122+
}
123+
124+
#[cfg(feature = "variable-lookbehinds")]
125+
fn variable_size_alt_lookbehind(c: &mut Criterion) {
126+
// Benchmark variable-size lookbehind with alternation
127+
let tree = Expr::parse_tree(r"(?<=a|bc)x").unwrap();
128+
let a = analyze(&tree, false).unwrap();
129+
let p = compile(&a, false).unwrap();
130+
let input = "bcx";
131+
c.bench_function("variable_size_alt_lookbehind", |b| {
132+
b.iter(|| run_default(&p, input, 0).unwrap())
133+
});
134+
}
135+
136+
#[cfg(feature = "variable-lookbehinds")]
137+
criterion_group!(
138+
name = lookbehind_benches;
139+
config = Criterion::default();
140+
targets = const_size_lookbehind,
141+
variable_size_lookbehind,
142+
variable_size_alt_lookbehind,
143+
);
144+
100145
criterion_group!(
101146
name = benches;
102147
config = Criterion::default().warm_up_time(Duration::from_secs(10));
@@ -114,4 +159,8 @@ criterion_group!(
114159
targets = run_backtrack_limit,
115160
);
116161

162+
#[cfg(feature = "variable-lookbehinds")]
163+
criterion_main!(benches, slow_benches, lookbehind_benches);
164+
165+
#[cfg(not(feature = "variable-lookbehinds"))]
117166
criterion_main!(benches, slow_benches);

examples/toy.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,26 @@ digraph G {
274274
assert_compiled_prog("a+(?<b>b*)(?=c)\\k<b>", &expected);
275275
}
276276

277+
#[test]
278+
#[cfg(feature = "variable-lookbehinds")]
279+
fn test_compilation_variable_lookbehind_debug_output() {
280+
let expected = " ".to_owned()
281+
+ "\
282+
0: Split(3, 1)
283+
1: Any
284+
2: Jmp(0)
285+
3: Save(0)
286+
4: Save(2)
287+
5: BackwardsDelegate(ReverseBackwardsDelegate { pattern: \"ab+\" })
288+
6: Restore(2)
289+
7: Lit(\"x\")
290+
8: Save(1)
291+
9: End
292+
";
293+
294+
assert_compiled_prog("(?<=ab+)x", &expected);
295+
}
296+
277297
#[test]
278298
fn test_compilation_wrapped_debug_output() {
279299
let expected = "wrapped Regex \"a+bc?\", explicit_capture_group_0: false";

src/compile.rs

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ use regex_automata::meta::{Builder as RaBuilder, Config as RaConfig};
2828
use std::{collections::BTreeMap, sync::RwLock};
2929

3030
use crate::analyze::Info;
31+
#[cfg(feature = "variable-lookbehinds")]
32+
use crate::vm::ReverseBackwardsDelegate;
3133
use crate::vm::{Delegate, Insn, Prog};
3234
use crate::LookAround::*;
3335
use crate::{CompileError, Error, Expr, LookAround, RegexOptions, Result};
@@ -444,12 +446,52 @@ impl Compiler {
444446

445447
fn compile_lookaround_inner(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
446448
if la == LookBehind || la == LookBehindNeg {
447-
if !inner.const_size {
448-
return Err(Error::CompileError(CompileError::LookBehindNotConst));
449+
if inner.const_size {
450+
self.b.add(Insn::GoBack(inner.min_size));
451+
self.visit(inner, false)
452+
} else if !inner.hard && inner.start_group == inner.end_group {
453+
#[cfg(feature = "variable-lookbehinds")]
454+
{
455+
let mut delegate_builder = DelegateBuilder::new();
456+
delegate_builder.push(inner);
457+
let pattern = &delegate_builder.re;
458+
// Use reverse matching for variable-sized lookbehinds without fancy features
459+
use regex_automata::nfa::thompson;
460+
// Build a reverse DFA for the pattern
461+
let dfa = match regex_automata::hybrid::dfa::DFA::builder()
462+
.thompson(thompson::Config::new().reverse(true))
463+
.build(&pattern)
464+
{
465+
Ok(dfa) => dfa,
466+
Err(e) => {
467+
return Err(Error::CompileError(CompileError::DfaBuildError(
468+
e.to_string(),
469+
)))
470+
}
471+
};
472+
473+
let cache = core::cell::RefCell::new(dfa.create_cache());
474+
self.b
475+
.add(Insn::BackwardsDelegate(ReverseBackwardsDelegate {
476+
dfa,
477+
cache,
478+
pattern: pattern.to_string(),
479+
}));
480+
Ok(())
481+
}
482+
#[cfg(not(feature = "variable-lookbehinds"))]
483+
{
484+
Err(Error::CompileError(
485+
CompileError::VariableLookBehindRequiresFeature,
486+
))
487+
}
488+
} else {
489+
// variable sized lookbehinds with fancy features are currently unsupported
490+
Err(Error::CompileError(CompileError::LookBehindNotConst))
449491
}
450-
self.b.add(Insn::GoBack(inner.min_size));
492+
} else {
493+
self.visit(inner, false)
451494
}
452-
self.visit(inner, false)
453495
}
454496

455497
fn compile_delegates(&mut self, infos: &[Info<'_>]) -> Result<()> {
@@ -748,6 +790,34 @@ mod tests {
748790
assert_matches!(prog[8], End);
749791
}
750792

793+
#[test]
794+
#[cfg(not(feature = "variable-lookbehinds"))]
795+
fn variable_lookbehind_requires_feature() {
796+
// Without the feature flag, variable-length lookbehinds should error
797+
let tree = Expr::parse_tree(r"(?<=ab+)x").unwrap();
798+
let info = analyze(&tree, true).unwrap();
799+
let result = compile(&info, true);
800+
assert!(result.is_err());
801+
assert_matches!(
802+
result.err().unwrap(),
803+
Error::CompileError(CompileError::VariableLookBehindRequiresFeature)
804+
);
805+
}
806+
807+
#[test]
808+
#[cfg(feature = "variable-lookbehinds")]
809+
fn variable_lookbehind_with_required_feature() {
810+
let prog = compile_prog(r"(?<=ab+)x");
811+
812+
assert_eq!(prog.len(), 5, "prog: {:?}", prog);
813+
814+
assert_matches!(prog[0], Save(0));
815+
assert_matches!(&prog[1], BackwardsDelegate(ReverseBackwardsDelegate { pattern, dfa: _, cache: _ }) if pattern == "ab+");
816+
assert_matches!(prog[2], Restore(0));
817+
assert_matches!(prog[3], Lit(ref l) if l == "x");
818+
assert_matches!(prog[4], End);
819+
}
820+
751821
fn compile_prog(re: &str) -> Vec<Insn> {
752822
let tree = Expr::parse_tree(re).unwrap();
753823
let info = analyze(&tree, true).unwrap();

src/error.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ pub enum CompileError {
6565
InnerError(RaBuildError),
6666
/// Look-behind assertion without constant size
6767
LookBehindNotConst,
68+
/// Variable-length lookbehind requires feature flag
69+
VariableLookBehindRequiresFeature,
70+
/// Error building reverse DFA for variable-length lookbehind
71+
DfaBuildError(String),
6872
/// Couldn't parse group name
6973
InvalidGroupName,
7074
/// Invalid group id in escape sequence
@@ -130,6 +134,12 @@ impl fmt::Display for CompileError {
130134
CompileError::LookBehindNotConst => {
131135
write!(f, "Look-behind assertion without constant size")
132136
},
137+
CompileError::VariableLookBehindRequiresFeature => {
138+
write!(f, "Variable-length lookbehind requires the 'variable-lookbehinds' feature")
139+
},
140+
CompileError::DfaBuildError(s) => {
141+
write!(f, "Failed to build reverse DFA for variable-length lookbehind: {}", s)
142+
},
133143
CompileError::InvalidGroupName => write!(f, "Could not parse group name"),
134144
CompileError::InvalidGroupNameBackref(s) => write!(f, "Invalid group name in back reference: {}", s),
135145
CompileError::InvalidBackref(g) => write!(f, "Invalid back reference to group {}", g),

src/lib.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,18 @@ let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100100
assert_eq!(fields, vec!["a", "b", "c\td e"]);
101101
```
102102
103+
# Features
104+
105+
This crate supports several optional features that can be enabled or disabled:
106+
107+
- **`std`** (enabled by default): Enables standard library support. Disable for `no_std` environments.
108+
- **`unicode`** (enabled by default): Enables Unicode support for character classes and word boundaries.
109+
- **`perf`** (enabled by default): Enables performance optimizations in the underlying regex engine.
110+
- **`variable-lookbehinds`** (enabled by default): Enables support for variable-length lookbehind
111+
assertions (e.g., `(?<=a+)`). Without this feature, only constant-length lookbehinds are supported.
112+
This feature uses reverse DFA matching from the `regex-automata` crate to efficiently handle
113+
variable-length patterns that don't use backreferences or other fancy features.
114+
103115
# Syntax
104116
105117
The regex syntax is based on the [regex] crate's, with some additional supported syntax.
@@ -150,6 +162,11 @@ Look-around assertions for matching without changing the current position:
150162
`(?<!exp)`
151163
: negative look-behind, succeeds if *exp* doesn't match to the left
152164
165+
**Note**: Look-behind assertions with variable length (e.g., `(?<=a+)`) are supported with the
166+
`variable-lookbehinds` feature (enabled by default). Without this feature, only constant-length
167+
look-behinds are supported. Variable-length look-behinds with backreferences or other "fancy"
168+
features are not currently supported.
169+
153170
Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
154171
155172
```

src/vm.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,34 @@ impl core::fmt::Debug for Delegate {
131131
}
132132
}
133133

134+
#[cfg(feature = "variable-lookbehinds")]
135+
#[derive(Clone)]
136+
/// Delegate matching in reverse to regex-automata
137+
pub struct ReverseBackwardsDelegate {
138+
/// The regex pattern as a string which will be matched in reverse, in a backwards direction
139+
pub pattern: String,
140+
/// The delegate regex to match backwards
141+
pub(crate) dfa: regex_automata::hybrid::dfa::DFA,
142+
/// Cache for DFA searches
143+
pub(crate) cache: core::cell::RefCell<regex_automata::hybrid::dfa::Cache>,
144+
}
145+
146+
#[cfg(feature = "variable-lookbehinds")]
147+
impl core::fmt::Debug for ReverseBackwardsDelegate {
148+
fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
149+
// Ensures it fails to compile if the struct changes
150+
let Self {
151+
pattern,
152+
dfa: _,
153+
cache: _,
154+
} = self;
155+
156+
f.debug_struct("ReverseBackwardsDelegate")
157+
.field("pattern", pattern)
158+
.finish()
159+
}
160+
}
161+
134162
/// Instruction of the VM.
135163
#[derive(Clone, Debug)]
136164
pub enum Insn {
@@ -220,6 +248,9 @@ pub enum Insn {
220248
ContinueFromPreviousMatchEnd,
221249
/// Continue only if the specified capture group has already been populated as part of the match
222250
BackrefExistsCondition(usize),
251+
#[cfg(feature = "variable-lookbehinds")]
252+
/// Reverse lookbehind using regex-automata for variable-sized patterns
253+
BackwardsDelegate(ReverseBackwardsDelegate),
223254
}
224255

225256
/// Sequence of instructions for the VM to execute.
@@ -733,6 +764,22 @@ pub(crate) fn run(
733764
break 'fail;
734765
}
735766
}
767+
#[cfg(feature = "variable-lookbehinds")]
768+
Insn::BackwardsDelegate(ReverseBackwardsDelegate {
769+
ref dfa,
770+
ref cache,
771+
pattern: _,
772+
}) => {
773+
// Use regex-automata to search backwards from current position
774+
let mut cache = cache.borrow_mut();
775+
let input = Input::new(s).anchored(Anchored::Yes).range(0..ix);
776+
777+
let found_match = matches!(dfa.try_search_rev(&mut cache, &input), Ok(Some(_)));
778+
779+
if !found_match {
780+
break 'fail;
781+
}
782+
}
736783
Insn::BeginAtomic => {
737784
let count = state.backtrack_count();
738785
state.stack_push(count);

tests/finding.rs

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,12 @@ fn lookbehind_grouping_single_expression() {
5050

5151
#[test]
5252
fn lookbehind_variable_sized_alt() {
53+
// These get compiled into multiple alternative lookbehinds
5354
assert_eq!(find(r"(?<=a|bc)", "xxa"), Some((3, 3)));
5455
assert_eq!(find(r"(?<=a|bc)", "xxbc"), Some((4, 4)));
5556
assert_eq!(find(r"(?<=a|bc)", "xx"), None);
5657
assert_eq!(find(r"(?<=a|bc)", "xxb"), None);
5758
assert_eq!(find(r"(?<=a|bc)", "xxc"), None);
58-
59-
assert!(Regex::new(r"(?<=a(?:b|cd))").is_err());
60-
assert!(Regex::new(r"(?<=a+b+))").is_err());
6159
}
6260

6361
#[test]
@@ -69,9 +67,49 @@ fn negative_lookbehind_variable_sized_alt() {
6967
assert_eq!(find(r"(?<!a|bc)x", "cx"), Some((1, 2)));
7068
assert_eq!(find(r"(?<!a|bc)x", "ax"), None);
7169
assert_eq!(find(r"(?<!a|bc)x", "bcx"), None);
70+
}
71+
72+
#[test]
73+
#[cfg(feature = "variable-lookbehinds")]
74+
fn lookbehind_positive_variable_sized_functionality() {
75+
assert_eq!(find(r"(?<=a(?:b|cd))x", "abx"), Some((2, 3)));
76+
assert_eq!(find(r"(?<=a(?:b|cd))x", "acdx"), Some((3, 4)));
77+
assert_eq!(find(r"(?<=a(?:b|cd))x", "ax"), None);
78+
assert_eq!(find(r"(?<=a(?:b|cd))x", "bcx"), None);
79+
80+
assert_eq!(find(r"(?<=a+b+)x", "abx"), Some((2, 3)));
81+
assert_eq!(find(r"(?<=a+b+)x", "aabbx"), Some((4, 5)));
82+
assert_eq!(find(r"(?<=a+b+)x", "aaabbbx"), Some((6, 7)));
83+
assert_eq!(find(r"(?<=a+b+)x", "ax"), None);
84+
assert_eq!(find(r"(?<=a+b+)x", "bx"), None);
85+
assert_eq!(find(r"(?<=a+b+)x", "abcx"), None);
86+
assert_eq!(find(r"(?<=a+b+)c", "bc aabbc"), Some((7, 8)));
87+
88+
assert_eq!(find(r"\b(?<=\|\s{0,9})(?:[gG]pu)\b", "|gpu"), Some((1, 4)));
89+
assert_eq!(
90+
find(r"\b(?<=\|\s{0,9})(?:[gG]pu)\b", "| gpu"),
91+
Some((3, 6))
92+
);
93+
}
94+
95+
#[test]
96+
#[cfg(feature = "variable-lookbehinds")]
97+
fn lookbehind_negative_variable_sized_functionality() {
98+
assert_eq!(find(r"(?<!a(?:b|cd))x", "abx"), None);
99+
assert_eq!(find(r"(?<!a(?:b|cd))x", "acdx"), None);
100+
assert_eq!(find(r"(?<!a(?:b|cd))x", "ax"), Some((1, 2)));
101+
assert_eq!(find(r"(?<!a(?:b|cd))x", "bcx"), Some((2, 3)));
102+
103+
assert_eq!(find(r"(?<!a+b+)x", "abx"), None);
104+
assert_eq!(find(r"(?<!a+b+)x", "aabbx"), None);
105+
assert_eq!(find(r"(?<!a+b+)x", "aaabbbx"), None);
106+
assert_eq!(find(r"(?<!a+b+)x", "ax"), Some((1, 2)));
107+
assert_eq!(find(r"(?<!a+b+)x", "bx"), Some((1, 2)));
108+
assert_eq!(find(r"(?<!a+b+)x", "abcx"), Some((3, 4)));
109+
assert_eq!(find(r"(?<!a+b+)c", "aabbc bc"), Some((7, 8)));
72110

73-
assert!(Regex::new(r"(?<!a(?:b|cd))").is_err());
74-
assert!(Regex::new(r"(?<!a+b+)").is_err());
111+
assert_eq!(find(r"\b(?<!\|\s{0,9})(?:[gG]pu)\b", "|gpu"), None);
112+
assert_eq!(find(r"\b(?<!\|\s{0,9})(?:[gG]pu)\b", "| gpu"), None);
75113
}
76114

77115
#[test]

0 commit comments

Comments
 (0)