Skip to content

Commit 810464a

Browse files
authored
Update Java lexer (#873)
1 parent 77e9146 commit 810464a

File tree

5 files changed

+284
-180
lines changed

5 files changed

+284
-180
lines changed

lexers/embedded/java.xml

Lines changed: 130 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -5,116 +5,188 @@
55
<filename>*.java</filename>
66
<mime_type>text/x-java</mime_type>
77
<dot_all>true</dot_all>
8-
<ensure_nl>true</ensure_nl>
98
</config>
109
<rules>
11-
<state name="class">
12-
<rule pattern="([^\W\d]|\$)[\w$]*">
13-
<token type="NameClass"/>
14-
<pop depth="1"/>
15-
</rule>
16-
</state>
17-
<state name="import">
18-
<rule pattern="[\w.]+\*?">
19-
<token type="NameNamespace"/>
20-
<pop depth="1"/>
21-
</rule>
22-
</state>
2310
<state name="root">
11+
<rule pattern="(^\s*)((?:(?:public|private|protected|static|strictfp)(?:\s+))*)(record)\b">
12+
<bygroups>
13+
<token type="TextWhitespace" />
14+
<usingself state="root" />
15+
<token type="KeywordDeclaration" />
16+
</bygroups>
17+
<push state="class" />
18+
</rule>
2419
<rule pattern="[^\S\n]+">
25-
<token type="Text"/>
20+
<token type="TextWhitespace" />
2621
</rule>
27-
<rule pattern="//.*?\n">
28-
<token type="CommentSingle"/>
22+
<rule pattern="(//.*?)(\n)">
23+
<bygroups>
24+
<token type="CommentSingle" />
25+
<token type="TextWhitespace" />
26+
</bygroups>
2927
</rule>
3028
<rule pattern="/\*.*?\*/">
31-
<token type="CommentMultiline"/>
29+
<token type="CommentMultiline" />
3230
</rule>
33-
<rule pattern="(assert|break|case|catch|continue|default|do|else|finally|for|if|goto|instanceof|new|return|switch|this|throw|try|while)\b">
34-
<token type="Keyword"/>
31+
<rule
32+
pattern="(assert|break|case|catch|continue|default|do|else|finally|for|if|goto|instanceof|new|return|switch|this|throw|try|while)\b">
33+
<token type="Keyword" />
3534
</rule>
3635
<rule pattern="((?:(?:[^\W\d]|\$)[\w.\[\]$&lt;&gt;]*\s+)+?)((?:[^\W\d]|\$)[\w$]*)(\s*)(\()">
3736
<bygroups>
38-
<usingself state="root"/>
39-
<token type="NameFunction"/>
40-
<token type="Text"/>
41-
<token type="Operator"/>
37+
<usingself state="root" />
38+
<token type="NameFunction" />
39+
<token type="TextWhitespace" />
40+
<token type="Punctuation" />
4241
</bygroups>
4342
</rule>
4443
<rule pattern="@[^\W\d][\w.]*">
45-
<token type="NameDecorator"/>
44+
<token type="NameDecorator" />
4645
</rule>
47-
<rule pattern="(abstract|const|enum|extends|final|implements|native|private|protected|public|static|strictfp|super|synchronized|throws|transient|volatile)\b">
48-
<token type="KeywordDeclaration"/>
46+
<rule
47+
pattern="(abstract|const|enum|extends|final|implements|native|private|protected|public|sealed|static|strictfp|super|synchronized|throws|transient|volatile|yield)\b">
48+
<token type="KeywordDeclaration" />
4949
</rule>
5050
<rule pattern="(boolean|byte|char|double|float|int|long|short|void)\b">
51-
<token type="KeywordType"/>
51+
<token type="KeywordType" />
5252
</rule>
5353
<rule pattern="(package)(\s+)">
5454
<bygroups>
55-
<token type="KeywordNamespace"/>
56-
<token type="Text"/>
55+
<token type="KeywordNamespace" />
56+
<token type="TextWhitespace" />
5757
</bygroups>
58-
<push state="import"/>
58+
<push state="import" />
5959
</rule>
6060
<rule pattern="(true|false|null)\b">
61-
<token type="KeywordConstant"/>
61+
<token type="KeywordConstant" />
62+
</rule>
63+
<rule pattern="(class|interface)\b">
64+
<token type="KeywordDeclaration" />
65+
<push state="class" />
6266
</rule>
63-
<rule pattern="(class|interface)(\s+)">
67+
<rule pattern="(var)(\s+)">
6468
<bygroups>
65-
<token type="KeywordDeclaration"/>
66-
<token type="Text"/>
69+
<token type="KeywordDeclaration" />
70+
<token type="TextWhitespace" />
6771
</bygroups>
68-
<push state="class"/>
72+
<push state="var" />
6973
</rule>
7074
<rule pattern="(import(?:\s+static)?)(\s+)">
7175
<bygroups>
72-
<token type="KeywordNamespace"/>
73-
<token type="Text"/>
76+
<token type="KeywordNamespace" />
77+
<token type="TextWhitespace" />
7478
</bygroups>
75-
<push state="import"/>
79+
<push state="import" />
7680
</rule>
77-
<rule pattern="&#34;(\\\\|\\&#34;|[^&#34;])*&#34;">
78-
<token type="LiteralString"/>
81+
<rule pattern="&quot;&quot;&quot;\n">
82+
<token type="LiteralString" />
83+
<push state="multiline_string" />
7984
</rule>
80-
<rule pattern="&#39;\\.&#39;|&#39;[^\\]&#39;|&#39;\\u[0-9a-fA-F]{4}&#39;">
81-
<token type="LiteralStringChar"/>
85+
<rule pattern="&quot;">
86+
<token type="LiteralString" />
87+
<push state="string" />
8288
</rule>
83-
<rule pattern="\d+[LlUu]*">
84-
<token type="LiteralNumberInteger"/>
89+
<rule pattern="&#x27;\\.&#x27;|&#x27;[^\\]&#x27;|&#x27;\\u[0-9a-fA-F]{4}&#x27;">
90+
<token type="LiteralStringChar" />
8591
</rule>
8692
<rule pattern="(\.)((?:[^\W\d]|\$)[\w$]*)">
8793
<bygroups>
88-
<token type="Operator"/>
89-
<token type="NameAttribute"/>
94+
<token type="Punctuation" />
95+
<token type="NameAttribute" />
96+
</bygroups>
97+
</rule>
98+
<rule pattern="^(\s*)(default)(:)">
99+
<bygroups>
100+
<token type="TextWhitespace" />
101+
<token type="Keyword" />
102+
<token type="Punctuation" />
90103
</bygroups>
91104
</rule>
92-
<rule pattern="^\s*([^\W\d]|\$)[\w$]*:">
93-
<token type="NameLabel"/>
105+
<rule pattern="^(\s*)((?:[^\W\d]|\$)[\w$]*)(:)">
106+
<bygroups>
107+
<token type="TextWhitespace" />
108+
<token type="NameLabel" />
109+
<token type="Punctuation" />
110+
</bygroups>
94111
</rule>
95112
<rule pattern="([^\W\d]|\$)[\w$]*">
96-
<token type="Name"/>
113+
<token type="Name" />
97114
</rule>
98-
<rule pattern="([0-9][0-9_]*\.([0-9][0-9_]*)?|\.[0-9][0-9_]*)([eE][+\-]?[0-9][0-9_]*)?[fFdD]?|[0-9][eE][+\-]?[0-9][0-9_]*[fFdD]?|[0-9]([eE][+\-]?[0-9][0-9_]*)?[fFdD]|0[xX]([0-9a-fA-F][0-9a-fA-F_]*\.?|([0-9a-fA-F][0-9a-fA-F_]*)?\.[0-9a-fA-F][0-9a-fA-F_]*)[pP][+\-]?[0-9][0-9_]*[fFdD]?">
99-
<token type="LiteralNumberFloat"/>
115+
<rule
116+
pattern="([0-9][0-9_]*\.([0-9][0-9_]*)?|\.[0-9][0-9_]*)([eE][+\-]?[0-9][0-9_]*)?[fFdD]?|[0-9][eE][+\-]?[0-9][0-9_]*[fFdD]?|[0-9]([eE][+\-]?[0-9][0-9_]*)?[fFdD]|0[xX]([0-9a-fA-F][0-9a-fA-F_]*\.?|([0-9a-fA-F][0-9a-fA-F_]*)?\.[0-9a-fA-F][0-9a-fA-F_]*)[pP][+\-]?[0-9][0-9_]*[fFdD]?">
117+
<token type="LiteralNumberFloat" />
100118
</rule>
101119
<rule pattern="0[xX][0-9a-fA-F][0-9a-fA-F_]*[lL]?">
102-
<token type="LiteralNumberHex"/>
120+
<token type="LiteralNumberHex" />
103121
</rule>
104122
<rule pattern="0[bB][01][01_]*[lL]?">
105-
<token type="LiteralNumberBin"/>
123+
<token type="LiteralNumberBin" />
106124
</rule>
107125
<rule pattern="0[0-7_]+[lL]?">
108-
<token type="LiteralNumberOct"/>
126+
<token type="LiteralNumberOct" />
109127
</rule>
110128
<rule pattern="0|[1-9][0-9_]*[lL]?">
111-
<token type="LiteralNumberInteger"/>
129+
<token type="LiteralNumberInteger" />
112130
</rule>
113-
<rule pattern="[~^*!%&amp;\[\](){}&lt;&gt;|+=:;,./?-]">
114-
<token type="Operator"/>
131+
<rule pattern="[~^*!%&amp;\[\]&lt;&gt;|+=/?-]">
132+
<token type="Operator" />
133+
</rule>
134+
<rule pattern="[{}();:.,]">
135+
<token type="Punctuation" />
115136
</rule>
116137
<rule pattern="\n">
117-
<token type="Text"/>
138+
<token type="TextWhitespace" />
139+
</rule>
140+
</state>
141+
<state name="class">
142+
<rule pattern="\s+">
143+
<token type="Text" />
144+
</rule>
145+
<rule pattern="([^\W\d]|\$)[\w$]*">
146+
<token type="NameClass" />
147+
<pop depth="1" />
148+
</rule>
149+
</state>
150+
<state name="var">
151+
<rule pattern="([^\W\d]|\$)[\w$]*">
152+
<token type="Name" />
153+
<pop depth="1" />
154+
</rule>
155+
</state>
156+
<state name="import">
157+
<rule pattern="[\w.]+\*?">
158+
<token type="NameNamespace" />
159+
<pop depth="1" />
160+
</rule>
161+
</state>
162+
<state name="multiline_string">
163+
<rule pattern="&quot;&quot;&quot;">
164+
<token type="LiteralString" />
165+
<pop depth="1" />
166+
</rule>
167+
<rule pattern="&quot;">
168+
<token type="LiteralString" />
169+
</rule>
170+
<rule>
171+
<include state="string" />
172+
</rule>
173+
</state>
174+
<state name="string">
175+
<rule pattern="[^\\&quot;]+">
176+
<token type="LiteralString" />
177+
</rule>
178+
<rule pattern="\\\\">
179+
<token type="LiteralString" />
180+
</rule>
181+
<rule pattern="\\&quot;">
182+
<token type="LiteralString" />
183+
</rule>
184+
<rule pattern="\\">
185+
<token type="LiteralString" />
186+
</rule>
187+
<rule pattern="&quot;">
188+
<token type="LiteralString" />
189+
<pop depth="1" />
118190
</rule>
119191
</state>
120192
</rules>

lexers/testdata/cql.expected

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1066,19 +1066,19 @@
10661066
{"type":"TextWhitespace","value":"\n"},
10671067
{"type":"LiteralStringHeredoc","value":"'"},
10681068
{"type":"Keyword","value":"return"},
1069-
{"type":"Text","value":" "},
1069+
{"type":"TextWhitespace","value":" "},
10701070
{"type":"Name","value":"Double"},
1071-
{"type":"Operator","value":"."},
1071+
{"type":"Punctuation","value":"."},
10721072
{"type":"NameAttribute","value":"valueOf"},
1073-
{"type":"Operator","value":"("},
1073+
{"type":"Punctuation","value":"("},
10741074
{"type":"Name","value":"Math"},
1075-
{"type":"Operator","value":"."},
1075+
{"type":"Punctuation","value":"."},
10761076
{"type":"NameAttribute","value":"log"},
1077-
{"type":"Operator","value":"("},
1077+
{"type":"Punctuation","value":"("},
10781078
{"type":"Name","value":"input"},
1079-
{"type":"Operator","value":"."},
1079+
{"type":"Punctuation","value":"."},
10801080
{"type":"NameAttribute","value":"doubleValue"},
1081-
{"type":"Operator","value":"()));"},
1081+
{"type":"Punctuation","value":"()));"},
10821082
{"type":"LiteralStringHeredoc","value":"'"},
10831083
{"type":"Punctuation","value":";"},
10841084
{"type":"TextWhitespace","value":"\n"}

lexers/testdata/java.actual

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,4 +23,13 @@ final class TargetUnsafeRefArrayAccess {
2323
@Alias
2424
@RecomputeFieldValue(kind = RecomputeFieldValue.Kind.ArrayIndexShift, declClass = Object[].class)
2525
public static int REF_ELEMENT_SHIFT;
26+
27+
public static void test() {
28+
System.out.println("""
29+
Hello, world!
30+
This is a multi-line string!
31+
It can also contain "quotes" and 'apostrophes' without breaking.
32+
We only need to escape \""" inside it.
33+
""");
34+
}
2635
}

0 commit comments

Comments
 (0)