Skip to content

Commit 8a69c06

Browse files
crisbeto authored and thePunderWoman committed
refactor(compiler): tokenize regular expression literals (#63857)
Updates the expression lexer to produce tokens for regular expression literals. PR Close #63857
1 parent 89cf62f commit 8a69c06

File tree

2 files changed

+244
-1
lines changed

2 files changed

+244
-1
lines changed

packages/compiler/src/expression_parser/lexer.ts

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ export enum TokenType {
1616
String,
1717
Operator,
1818
Number,
19+
RegExpBody,
20+
RegExpFlags,
1921
Error,
2022
}
2123

@@ -128,6 +130,14 @@ export class Token {
128130
return this.type === TokenType.Error;
129131
}
130132

133+
/** Whether this token is the body (pattern text) of a regular expression literal. */
isRegExpBody(): boolean {
  return this.type === TokenType.RegExpBody;
}
136+
137+
/** Whether this token is the flags portion of a regular expression literal. */
isRegExpFlags(): boolean {
  return this.type === TokenType.RegExpFlags;
}
140+
131141
toNumber(): number {
132142
return this.type === TokenType.Number ? this.numValue : -1;
133143
}
@@ -159,6 +169,8 @@ export class Token {
159169
case TokenType.PrivateIdentifier:
160170
case TokenType.String:
161171
case TokenType.Error:
172+
case TokenType.RegExpBody:
173+
case TokenType.RegExpFlags:
162174
return this.strValue;
163175
case TokenType.Number:
164176
return this.numValue.toString();
@@ -207,6 +219,14 @@ function newErrorToken(index: number, end: number, message: string): Token {
207219
return new Token(index, end, TokenType.Error, 0, message);
208220
}
209221

222+
/**
 * Creates a token for the body of a regular expression literal.
 * `text` is the pattern without the delimiting slashes, while the
 * [index, end) span still covers the slashes.
 */
function newRegExpBodyToken(index: number, end: number, text: string): Token {
  return new Token(index, end, TokenType.RegExpBody, 0, text);
}
225+
226+
/** Creates a token for the flags that follow a regular expression literal's closing slash. */
function newRegExpFlagsToken(index: number, end: number, text: string): Token {
  return new Token(index, end, TokenType.RegExpFlags, 0, text);
}
229+
210230
export const EOF: Token = new Token(-1, -1, TokenType.Character, 0, '');
211231

212232
class _Scanner {
@@ -300,6 +320,9 @@ class _Scanner {
300320
case chars.$MINUS:
301321
return this.scanComplexOperator(start, '-', chars.$EQ, '=');
302322
case chars.$SLASH:
323+
if (this.isStartOfRegex()) {
324+
return this.scanRegex(index);
325+
}
303326
return this.scanComplexOperator(start, '/', chars.$EQ, '=');
304327
case chars.$PERCENT:
305328
return this.scanComplexOperator(start, '%', chars.$EQ, '=');
@@ -606,6 +629,78 @@ class _Scanner {
606629

607630
return newOperatorToken(start, this.index, operator);
608631
}
632+
633+
/**
 * Determines whether a `/` at the current position begins a regular expression
 * literal rather than a division/assignment operator.
 *
 * Mirrors the JavaScript lexer heuristic: a slash is a division operator only
 * when the previous token could end an operand (identifier, private identifier,
 * number, string, keyword, `)` or `]`); otherwise it starts a regex.
 */
private isStartOfRegex(): boolean {
  const previous = this.tokens[this.tokens.length - 1];

  // At the very start of the expression a slash can only begin a regex.
  if (previous === undefined) {
    return true;
  }

  const previousEndsOperand =
    previous.isIdentifier() ||
    previous.isPrivateIdentifier() ||
    previous.isNumber() ||
    previous.isString() ||
    previous.isKeyword() ||
    previous.isCharacter(chars.$RPAREN) ||
    previous.isCharacter(chars.$RBRACKET);

  return !previousEndsOperand;
}
650+
651+
/**
 * Scans a regular expression literal whose opening `/` is at the current position.
 *
 * Pushes a `RegExpBody` token (and returns a `RegExpFlags` token when flags
 * follow), or returns just the body token when there are no flags. Produces an
 * error token if the input ends before the closing `/`.
 */
private scanRegex(tokenStart: number): Token {
  // Skip past the opening `/`; the body text starts after it.
  this.advance();
  const textStart = this.index;
  let inEscape = false;
  let inCharacterClass = false;

  while (true) {
    const peek = this.peek;

    if (peek === chars.$EOF) {
      return this.error('Unterminated regular expression', 0);
    }

    // Order matters: an escaped character is consumed verbatim first, so
    // `\\/`, `\\[` and `\\]` never affect the class/terminator tracking below.
    if (inEscape) {
      inEscape = false;
    } else if (peek === chars.$BACKSLASH) {
      inEscape = true;
    } else if (peek === chars.$LBRACKET) {
      inCharacterClass = true;
    } else if (peek === chars.$RBRACKET) {
      inCharacterClass = false;
    } else if (peek === chars.$SLASH && !inCharacterClass) {
      // Unescaped `/` outside a character class terminates the body.
      break;
    }
    this.advance();
  }

  // Note that we want the text without the slashes,
  // but we still want the slashes to be part of the span.
  const value = this.input.substring(textStart, this.index);
  this.advance();
  const bodyToken = newRegExpBodyToken(tokenStart, this.index, value);
  const flagsToken = this.scanRegexFlags(this.index);

  if (flagsToken !== null) {
    // The scanner's caller pushes the returned token, so push the body
    // ourselves and return the flags as the "current" token.
    this.tokens.push(bodyToken);
    return flagsToken;
  }

  return bodyToken;
}
692+
693+
/**
 * Scans the flags that may follow a regular expression literal's closing slash.
 * Returns `null` when the next character is not an ASCII letter (no flags).
 * The run of letters is consumed as-is; flag validity is not checked here.
 */
private scanRegexFlags(start: number): Token | null {
  if (!chars.isAsciiLetter(this.peek)) {
    return null;
  }

  // The guard above guarantees at least one letter, so do/while is safe.
  do {
    this.advance();
  } while (chars.isAsciiLetter(this.peek));

  return newRegExpFlagsToken(start, this.index, this.input.substring(start, this.index));
}
609704
}
610705

611706
function isIdentifierStart(code: number): boolean {

packages/compiler/test/expression_parser/lexer_spec.ts

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,18 @@ function expectErrorToken(token: Token, index: any, end: number, message: string
7272
expect(token.toString()).toEqual(message);
7373
}
7474

75+
function expectRegExpBodyToken(token: any, index: number, end: number, str: string) {
76+
expectToken(token, index, end);
77+
expect(token.isRegExpBody()).toBe(true);
78+
expect(token.toString()).toEqual(str);
79+
}
80+
81+
function expectRegExpFlagsToken(token: any, index: number, end: number, str: string) {
82+
expectToken(token, index, end);
83+
expect(token.isRegExpFlags()).toBe(true);
84+
expect(token.toString()).toEqual(str);
85+
}
86+
7587
describe('lexer', () => {
7688
describe('token', () => {
7789
it('should tokenize a simple identifier', () => {
@@ -410,7 +422,7 @@ describe('lexer', () => {
410422
expectOperatorToken(lex('+=')[0], 0, 2, '+=');
411423
expectOperatorToken(lex('-=')[0], 0, 2, '-=');
412424
expectOperatorToken(lex('*=')[0], 0, 2, '*=');
413-
expectOperatorToken(lex('/=')[0], 0, 2, '/=');
425+
expectOperatorToken(lex('a /= b')[1], 2, 4, '/=');
414426
expectOperatorToken(lex('%=')[0], 0, 2, '%=');
415427
expectOperatorToken(lex('**=')[0], 0, 3, '**=');
416428
expectOperatorToken(lex('&&=')[0], 0, 3, '&&=');
@@ -673,5 +685,141 @@ describe('lexer', () => {
673685
expectStringToken(tokens[6], 23, 24, '', StringTokenKind.TemplateLiteralEnd);
674686
});
675687
});
688+
689+
describe('regular expressions', () => {
690+
it('should tokenize a simple regex', () => {
691+
const tokens: Token[] = lex('/abc/');
692+
expect(tokens.length).toBe(1);
693+
expectRegExpBodyToken(tokens[0], 0, 5, 'abc');
694+
});
695+
696+
it('should tokenize a regex with flags', () => {
697+
const tokens: Token[] = lex('/abc/gim');
698+
expect(tokens.length).toBe(2);
699+
expectRegExpBodyToken(tokens[0], 0, 5, 'abc');
700+
expectRegExpFlagsToken(tokens[1], 5, 8, 'gim');
701+
});
702+
703+
it('should tokenize an identifier immediately after a regex', () => {
704+
const tokens: Token[] = lex('/abc/ g');
705+
expect(tokens.length).toBe(2);
706+
expectRegExpBodyToken(tokens[0], 0, 5, 'abc');
707+
expectIdentifierToken(tokens[1], 6, 7, 'g');
708+
});
709+
710+
it('should tokenize a regex with an escaped slashes', () => {
711+
const tokens: Token[] = lex('/^http:\\/\\/foo\\.bar/');
712+
expect(tokens.length).toBe(1);
713+
expectRegExpBodyToken(tokens[0], 0, 20, '^http:\\/\\/foo\\.bar');
714+
});
715+
716+
it('should tokenize a regex with un-escaped slashes in a character class', () => {
717+
const tokens: Token[] = lex('/[a/]$/');
718+
expect(tokens.length).toBe(1);
719+
expectRegExpBodyToken(tokens[0], 0, 7, '[a/]$');
720+
});
721+
722+
it('should tokenize a regex with a backslash', () => {
723+
const tokens: Token[] = lex('/a\\w+/');
724+
expect(tokens.length).toBe(1);
725+
expectRegExpBodyToken(tokens[0], 0, 6, 'a\\w+');
726+
});
727+
728+
it('should tokenize a method call on a regex', () => {
729+
const tokens: Token[] = lex('/abc/.test("foo")');
730+
expect(tokens.length).toBe(6);
731+
expectRegExpBodyToken(tokens[0], 0, 5, 'abc');
732+
expectCharacterToken(tokens[1], 5, 6, '.');
733+
expectIdentifierToken(tokens[2], 6, 10, 'test');
734+
expectCharacterToken(tokens[3], 10, 11, '(');
735+
expectStringToken(tokens[4], 11, 16, 'foo', StringTokenKind.Plain);
736+
expectCharacterToken(tokens[5], 16, 17, ')');
737+
});
738+
739+
it('should tokenize a method call with a regex parameter', () => {
740+
const tokens: Token[] = lex('"foo".match(/abc/)');
741+
expect(tokens.length).toBe(6);
742+
expectStringToken(tokens[0], 0, 5, 'foo', StringTokenKind.Plain);
743+
expectCharacterToken(tokens[1], 5, 6, '.');
744+
expectIdentifierToken(tokens[2], 6, 11, 'match');
745+
expectCharacterToken(tokens[3], 11, 12, '(');
746+
expectRegExpBodyToken(tokens[4], 12, 17, 'abc');
747+
expectCharacterToken(tokens[5], 17, 18, ')');
748+
});
749+
750+
it('should not tokenize a regex preceded by a square bracket', () => {
751+
const tokens: Token[] = lex('a[0] /= b');
752+
expect(tokens.length).toBe(6);
753+
expectIdentifierToken(tokens[0], 0, 1, 'a');
754+
expectCharacterToken(tokens[1], 1, 2, '[');
755+
expectNumberToken(tokens[2], 2, 3, 0);
756+
expectCharacterToken(tokens[3], 3, 4, ']');
757+
expectOperatorToken(tokens[4], 5, 7, '/=');
758+
expectIdentifierToken(tokens[5], 8, 9, 'b');
759+
});
760+
761+
it('should not tokenize a regex preceded by an identifier', () => {
762+
const tokens: Token[] = lex('a / b');
763+
expect(tokens.length).toBe(3);
764+
expectIdentifierToken(tokens[0], 0, 1, 'a');
765+
expectOperatorToken(tokens[1], 2, 3, '/');
766+
expectIdentifierToken(tokens[2], 4, 5, 'b');
767+
});
768+
769+
it('should not tokenize a regex preceded by a number', () => {
770+
const tokens: Token[] = lex('1 / b');
771+
expect(tokens.length).toBe(3);
772+
expectNumberToken(tokens[0], 0, 1, 1);
773+
expectOperatorToken(tokens[1], 2, 3, '/');
774+
expectIdentifierToken(tokens[2], 4, 5, 'b');
775+
});
776+
777+
it('should not tokenize a regex that is preceded by a string', () => {
778+
const tokens: Token[] = lex('"a" / b');
779+
expect(tokens.length).toBe(3);
780+
expectStringToken(tokens[0], 0, 3, 'a', StringTokenKind.Plain);
781+
expectOperatorToken(tokens[1], 4, 5, '/');
782+
expectIdentifierToken(tokens[2], 6, 7, 'b');
783+
});
784+
785+
it('should not tokenize a regex preceded by a closing parenthesis', () => {
786+
const tokens: Token[] = lex('(a) / b');
787+
expect(tokens.length).toBe(5);
788+
expectCharacterToken(tokens[0], 0, 1, '(');
789+
expectIdentifierToken(tokens[1], 1, 2, 'a');
790+
expectCharacterToken(tokens[2], 2, 3, ')');
791+
expectOperatorToken(tokens[3], 4, 5, '/');
792+
expectIdentifierToken(tokens[4], 6, 7, 'b');
793+
});
794+
795+
it('should not tokenize a regex that is preceded by a keyword', () => {
796+
const tokens: Token[] = lex('this / b');
797+
expect(tokens.length).toBe(3);
798+
expectKeywordToken(tokens[0], 0, 4, 'this');
799+
expectOperatorToken(tokens[1], 5, 6, '/');
800+
expectIdentifierToken(tokens[2], 7, 8, 'b');
801+
});
802+
803+
it('should produce an error for an unterminated regex', () => {
804+
expectErrorToken(
805+
lex('/a')[0],
806+
2,
807+
2,
808+
'Lexer Error: Unterminated regular expression at column 2 in expression [/a]',
809+
);
810+
});
811+
812+
it('should produce an error for a incorrectly-escaped regex', () => {
813+
const tokens = lex('/a\\\\//');
814+
expect(tokens.length).toBe(2);
815+
expectRegExpBodyToken(tokens[0], 0, 5, 'a\\\\');
816+
expectErrorToken(
817+
tokens[1],
818+
6,
819+
6,
820+
'Lexer Error: Unterminated regular expression at column 6 in expression [/a\\\\//]',
821+
);
822+
});
823+
});
676824
});
677825
});

0 commit comments

Comments
 (0)