Skip to content

Commit b3dc6bc

Browse files
authored
Implement (contextual) keywords and use their versioning from v2 (#723)
Closes #568. There is still one outstanding issue where we return a `Vec<TokenKind>` from `next_token`; I'd like to return a more specialized type and ideally pass it on the stack (2x2 bytes), rather than on the heap (extra 3x8 bytes for the Vec handle + indirection). We should name it better and properly show that we can return at most 2 token kinds (single token kind or identifier + kw combo). To do: - [x] Return tokens from `next_token` via the stack. Apart from that, I think this is a more correct approach than #598, especially accounting for the new keyword definition format in DSL v2. The main change is that we check the keyword trie, and additionally the (newly introduced) compound keyword scanners, only after the token has been lexed as an identifier. For each context, we collect the Identifier scanners used by the keywords and attempt promotion there. The existing lexing performance is not impacted from what I've seen when running the sanctuary tests and I can verify (incl. CST tests) that we now properly parse source that uses contextual keywords (e.g. `from`) and that the compound keywords (e.g. `ufixedMxN`) are properly versioned. This adapts the existing `codegen_grammar` interface that's a leftover from DSL v1; I did that to work on finishing #638; once this is merged and we now properly parse contextual keywords, I'll move to clean it up and reduce the parser codegen indirection (right now we go from v2 -> v1 model -> code generator -> Tera templates; I'd like to at least cut out the v1 model and/or simplify visiting v2 from the existing `CodeGenerator`). Please excuse the WIP comments in the middle; the first and the last ones should make sense when reviewing. I can simplify this a bit for review, if needed.
1 parent 662a672 commit b3dc6bc

93 files changed

Lines changed: 9493 additions & 5616 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.changeset/dry-turtles-rhyme.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@nomicfoundation/slang": minor
3+
---
4+
5+
Properly parse unreserved keywords in an identifier position, e.g. `from`, `emit`, `global`, etc.

crates/codegen/grammar/src/grammar.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use semver::Version;
44

55
use crate::parser_definition::{ParserDefinitionRef, TriviaParserDefinitionRef};
66
use crate::visitor::{GrammarVisitor, Visitable};
7-
use crate::{PrecedenceParserDefinitionRef, ScannerDefinitionRef};
7+
use crate::{KeywordScannerDefinitionRef, PrecedenceParserDefinitionRef, ScannerDefinitionRef};
88

99
pub struct Grammar {
1010
pub name: String,
@@ -36,6 +36,7 @@ impl Grammar {
3636
#[derive(Clone)]
3737
pub enum GrammarElement {
3838
ScannerDefinition(ScannerDefinitionRef),
39+
KeywordScannerDefinition(KeywordScannerDefinitionRef),
3940
TriviaParserDefinition(TriviaParserDefinitionRef),
4041
ParserDefinition(ParserDefinitionRef),
4142
PrecedenceParserDefinition(PrecedenceParserDefinitionRef),
@@ -45,6 +46,7 @@ impl GrammarElement {
4546
pub fn name(&self) -> &'static str {
4647
match self {
4748
Self::ScannerDefinition(scanner) => scanner.name(),
49+
Self::KeywordScannerDefinition(scanner) => scanner.name(),
4850
Self::TriviaParserDefinition(trivia_parser) => trivia_parser.name(),
4951
Self::ParserDefinition(parser) => parser.name(),
5052
Self::PrecedenceParserDefinition(precedence_parser) => precedence_parser.name(),
@@ -80,6 +82,7 @@ impl Visitable for GrammarElement {
8082
fn accept_visitor<V: GrammarVisitor>(&self, visitor: &mut V) {
8183
match self {
8284
Self::ScannerDefinition(scanner) => scanner.accept_visitor(visitor),
85+
Self::KeywordScannerDefinition(scanner) => scanner.accept_visitor(visitor),
8386
Self::TriviaParserDefinition(trivia_parser) => trivia_parser.accept_visitor(visitor),
8487
Self::ParserDefinition(parser) => parser.accept_visitor(visitor),
8588
Self::PrecedenceParserDefinition(precedence_parser) => {

crates/codegen/grammar/src/parser_definition.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ use std::fmt::Debug;
22
use std::rc::Rc;
33

44
use crate::visitor::{GrammarVisitor, Visitable};
5-
use crate::{PrecedenceParserDefinitionRef, ScannerDefinitionRef, VersionQualityRange};
5+
use crate::{
6+
KeywordScannerDefinitionRef, PrecedenceParserDefinitionRef, ScannerDefinitionRef,
7+
VersionQualityRange,
8+
};
69

710
/// A named wrapper, used to give a name to a [`ParserDefinitionNode`].
811
#[derive(Clone, Debug)]
@@ -59,6 +62,7 @@ pub enum ParserDefinitionNode {
5962
Sequence(Vec<Named<Self>>),
6063
Choice(Named<Vec<Self>>),
6164
ScannerDefinition(ScannerDefinitionRef),
65+
KeywordScannerDefinition(KeywordScannerDefinitionRef),
6266
TriviaParserDefinition(TriviaParserDefinitionRef),
6367
ParserDefinition(ParserDefinitionRef),
6468
PrecedenceParserDefinition(PrecedenceParserDefinitionRef),
@@ -128,6 +132,7 @@ impl Visitable for ParserDefinitionNode {
128132
}
129133

130134
Self::ScannerDefinition(_)
135+
| Self::KeywordScannerDefinition(_)
131136
| Self::TriviaParserDefinition(_)
132137
| Self::ParserDefinition(_)
133138
| Self::PrecedenceParserDefinition(_) => {}

crates/codegen/grammar/src/scanner_definition.rs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,98 @@ impl Visitable for ScannerDefinitionNode {
6565
}
6666
}
6767
}
68+
69+
pub trait KeywordScannerDefinition: Debug {
70+
fn name(&self) -> &'static str;
71+
fn identifier_scanner(&self) -> &'static str;
72+
fn definitions(&self) -> &[KeywordScannerDefinitionVersionedNode];
73+
}
74+
75+
pub type KeywordScannerDefinitionRef = Rc<dyn KeywordScannerDefinition>;
76+
77+
impl Visitable for KeywordScannerDefinitionRef {
78+
fn accept_visitor<V: GrammarVisitor>(&self, visitor: &mut V) {
79+
visitor.keyword_scanner_definition_enter(self);
80+
}
81+
}
82+
83+
#[derive(Debug)]
84+
pub struct KeywordScannerDefinitionVersionedNode {
85+
/// Underlying keyword scanner (i.e. identifier scanner)
86+
pub value: KeywordScannerDefinitionNode,
87+
/// When the keyword scanner is enabled
88+
pub enabled: Vec<VersionQualityRange>,
89+
/// When the keyword is reserved, i.e. can't be used in other position (e.g. as a name)
90+
pub reserved: Vec<VersionQualityRange>,
91+
}
92+
93+
#[derive(Clone, Debug)]
94+
pub enum KeywordScannerDefinitionNode {
95+
Optional(Box<Self>),
96+
Sequence(Vec<Self>),
97+
Choice(Vec<Self>),
98+
Atom(String),
99+
// No repeatable combinators, because keywords are assumed to be finite
100+
}
101+
102+
impl From<KeywordScannerDefinitionNode> for ScannerDefinitionNode {
103+
fn from(val: KeywordScannerDefinitionNode) -> Self {
104+
match val {
105+
KeywordScannerDefinitionNode::Optional(node) => {
106+
ScannerDefinitionNode::Optional(Box::new((*node).into()))
107+
}
108+
KeywordScannerDefinitionNode::Sequence(nodes) => {
109+
ScannerDefinitionNode::Sequence(nodes.into_iter().map(Into::into).collect())
110+
}
111+
KeywordScannerDefinitionNode::Atom(string) => ScannerDefinitionNode::Literal(string),
112+
KeywordScannerDefinitionNode::Choice(nodes) => {
113+
ScannerDefinitionNode::Choice(nodes.into_iter().map(Into::into).collect())
114+
}
115+
}
116+
}
117+
}
118+
119+
/// A [`KeywordScannerDefinitionRef`] that only has a single atom value.
120+
///
121+
/// The main usage for this type is to construct a keyword trie in parser generator, as trie will
122+
/// only work with single atom values and keyword promotion needs to additionally account for
123+
/// keyword reservation, rather than just literal presence.
124+
#[derive(Clone)]
125+
pub struct KeywordScannerAtomic(KeywordScannerDefinitionRef);
126+
127+
impl KeywordScannerAtomic {
128+
/// Wraps the keyword scanner definition if it is a single atom value.
129+
pub fn try_from_def(def: &KeywordScannerDefinitionRef) -> Option<Self> {
130+
match def.definitions() {
131+
[KeywordScannerDefinitionVersionedNode {
132+
value: KeywordScannerDefinitionNode::Atom(_),
133+
..
134+
}] => Some(Self(def.clone())),
135+
_ => None,
136+
}
137+
}
138+
}
139+
140+
impl std::ops::Deref for KeywordScannerAtomic {
141+
type Target = KeywordScannerDefinitionRef;
142+
143+
fn deref(&self) -> &Self::Target {
144+
&self.0
145+
}
146+
}
147+
148+
impl KeywordScannerAtomic {
149+
pub fn definition(&self) -> &KeywordScannerDefinitionVersionedNode {
150+
let def = &self.0.definitions().get(0);
151+
def.expect("KeywordScannerAtomic should have exactly one definition")
152+
}
153+
pub fn value(&self) -> &str {
154+
match self.definition() {
155+
KeywordScannerDefinitionVersionedNode {
156+
value: KeywordScannerDefinitionNode::Atom(atom),
157+
..
158+
} => atom,
159+
_ => unreachable!("KeywordScannerAtomic should have a single atom value"),
160+
}
161+
}
162+
}

crates/codegen/grammar/src/visitor.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
use crate::{
2-
Grammar, ParserDefinitionNode, ParserDefinitionRef, PrecedenceParserDefinitionNode,
3-
PrecedenceParserDefinitionRef, ScannerDefinitionNode, ScannerDefinitionRef,
4-
TriviaParserDefinitionRef,
2+
Grammar, KeywordScannerDefinitionRef, ParserDefinitionNode, ParserDefinitionRef,
3+
PrecedenceParserDefinitionNode, PrecedenceParserDefinitionRef, ScannerDefinitionNode,
4+
ScannerDefinitionRef, TriviaParserDefinitionRef,
55
};
66

77
pub trait GrammarVisitor {
88
fn grammar_enter(&mut self, _grammar: &Grammar) {}
99
fn grammar_leave(&mut self, _grammar: &Grammar) {}
1010

1111
fn scanner_definition_enter(&mut self, _scanner: &ScannerDefinitionRef) {}
12+
fn keyword_scanner_definition_enter(&mut self, _scanner: &KeywordScannerDefinitionRef) {}
1213
fn trivia_parser_definition_enter(&mut self, _trivia_parser: &TriviaParserDefinitionRef) {}
1314
fn parser_definition_enter(&mut self, _parser: &ParserDefinitionRef) {}
1415
fn precedence_parser_definition_enter(&mut self, _parser: &PrecedenceParserDefinitionRef) {}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
use codegen_grammar::{
2+
KeywordScannerDefinitionNode, KeywordScannerDefinitionRef, ScannerDefinitionNode,
3+
};
4+
use proc_macro2::TokenStream;
5+
use quote::{format_ident, quote};
6+
7+
use crate::parser_definition::VersionQualityRangeVecExtensions;
8+
use crate::scanner_definition::ScannerDefinitionNodeExtensions;
9+
10+
pub trait KeywordScannerDefinitionExtensions {
11+
fn to_scanner_code(&self) -> TokenStream;
12+
}
13+
14+
impl KeywordScannerDefinitionExtensions for KeywordScannerDefinitionRef {
15+
fn to_scanner_code(&self) -> TokenStream {
16+
let name_ident = format_ident!("{}", self.name());
17+
let token_kind = quote! { TokenKind::#name_ident };
18+
19+
let kw_scanners: Vec<_> = self
20+
.definitions()
21+
.iter()
22+
.map(|versioned_kw| {
23+
let scanner = versioned_kw.value.to_scanner_code();
24+
let enabled_cond = versioned_kw.enabled.as_bool_expr();
25+
let reserved_cond = versioned_kw.reserved.as_bool_expr();
26+
27+
// Simplify the emitted code if we trivially know that reserved or enabled is true
28+
match (&*reserved_cond.to_string(), &*enabled_cond.to_string()) {
29+
("true", _) => quote! {
30+
if #scanner {
31+
KeywordScan::Reserved(#token_kind)
32+
} else {
33+
KeywordScan::Absent
34+
}
35+
},
36+
("false", _) => quote! {
37+
if #enabled_cond && #scanner {
38+
KeywordScan::Present(#token_kind)
39+
} else {
40+
KeywordScan::Absent
41+
}
42+
},
43+
(_, "true") => quote! {
44+
if #scanner {
45+
if #reserved_cond {
46+
KeywordScan::Reserved(#token_kind)
47+
} else {
48+
KeywordScan::Present(#token_kind)
49+
}
50+
} else {
51+
KeywordScan::Absent
52+
}
53+
},
54+
(_, "false") => quote! {
55+
if #reserved_cond && #scanner {
56+
KeywordScan::Reserved(#token_kind)
57+
} else {
58+
KeywordScan::Absent
59+
}
60+
},
61+
_ => quote! {
62+
if (#reserved_cond || #enabled_cond) && #scanner {
63+
if #reserved_cond {
64+
KeywordScan::Reserved(#token_kind)
65+
} else {
66+
KeywordScan::Present(#token_kind)
67+
}
68+
} else {
69+
KeywordScan::Absent
70+
}
71+
},
72+
}
73+
})
74+
.collect();
75+
76+
match &kw_scanners[..] {
77+
[] => quote! { KeywordScan::Absent },
78+
multiple => quote! { scan_keyword_choice!(input, ident, #(#multiple),*) },
79+
}
80+
}
81+
}
82+
83+
impl KeywordScannerDefinitionExtensions for KeywordScannerDefinitionNode {
84+
fn to_scanner_code(&self) -> TokenStream {
85+
// This is a subset; let's reuse that
86+
ScannerDefinitionNode::from(self.clone()).to_scanner_code()
87+
}
88+
}

crates/codegen/parser/generator/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod ast_model;
2+
mod keyword_scanner_definition;
23
mod parser_definition;
34
mod precedence_parser_definition;
45
mod rust_generator;

crates/codegen/parser/generator/src/parser_definition.rs

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use codegen_grammar::{
55
use inflector::Inflector;
66
use proc_macro2::TokenStream;
77
use quote::{format_ident, quote};
8+
use semver::Version;
89

910
pub trait ParserDefinitionExtensions {
1011
fn to_parser_code(&self) -> TokenStream;
@@ -138,6 +139,21 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
138139
}
139140
}
140141

142+
// Keyword scanner uses the promotion inside the parse_token
143+
Self::KeywordScannerDefinition(scanner_definition) => {
144+
let kind = format_ident!("{name}", name = scanner_definition.name());
145+
146+
let parse_token = if is_trivia {
147+
format_ident!("parse_token")
148+
} else {
149+
format_ident!("parse_token_with_trivia")
150+
};
151+
152+
quote! {
153+
self.#parse_token::<#lex_ctx>(input, TokenKind::#kind)
154+
}
155+
}
156+
141157
Self::TriviaParserDefinition(trivia_parser_definition) => {
142158
let function_name =
143159
format_ident!("{}", trivia_parser_definition.name().to_snake_case());
@@ -299,13 +315,24 @@ impl ParserDefinitionNodeExtensions for ParserDefinitionNode {
299315

300316
pub trait VersionQualityRangeVecExtensions {
301317
fn wrap_code(&self, if_true: TokenStream, if_false: Option<TokenStream>) -> TokenStream;
318+
// Quotes a boolean expression that is satisfied for the given version quality ranges
319+
fn as_bool_expr(&self) -> TokenStream;
302320
}
303321

304322
impl VersionQualityRangeVecExtensions for Vec<VersionQualityRange> {
305-
fn wrap_code(&self, if_true: TokenStream, if_false: Option<TokenStream>) -> TokenStream {
323+
fn as_bool_expr(&self) -> TokenStream {
306324
if self.is_empty() {
307-
if_true
325+
quote!(true)
308326
} else {
327+
// Optimize for legibility; return `false` for "never enabled"
328+
match self.as_slice() {
329+
[VersionQualityRange {
330+
from,
331+
quality: VersionQuality::Removed,
332+
}] if from == &Version::new(0, 0, 0) => return quote!(false),
333+
_ => {}
334+
}
335+
309336
let flags = self.iter().map(|vqr| {
310337
let flag = format_ident!(
311338
"version_is_at_least_{v}",
@@ -317,8 +344,18 @@ impl VersionQualityRangeVecExtensions for Vec<VersionQualityRange> {
317344
quote! { !self.#flag }
318345
}
319346
});
347+
quote! { #(#flags)&&* }
348+
}
349+
}
350+
351+
fn wrap_code(&self, if_true: TokenStream, if_false: Option<TokenStream>) -> TokenStream {
352+
if self.is_empty() {
353+
if_true
354+
} else {
355+
let condition = self.as_bool_expr();
356+
320357
let else_part = if_false.map(|if_false| quote! { else { #if_false } });
321-
quote! { if #(#flags)&&* { #if_true } #else_part }
358+
quote! { if #condition { #if_true } #else_part }
322359
}
323360
}
324361
}

0 commit comments

Comments
 (0)