Skip to content

Commit 05ccf9f

Browse files
committed
perf(linter/plugins): transfer tokens via raw transfer (#19893)
Transfer tokens via raw transfer in Oxlint JS plugins. The deserializer is written by hand: `Token` is not a normal struct, so its deserializer cannot be auto-generated.
1 parent c2a42f6 commit 05ccf9f

File tree

8 files changed

+165
-80
lines changed

8 files changed

+165
-80
lines changed

apps/oxlint/src-js/plugins/source_code.ts

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import type { BufferWithArrays, Comment, Node } from "./types.ts";
2929
import type { ScopeManager } from "./scope.ts";
3030

3131
// Text decoder, for decoding source text from buffer
32-
export const textDecoder = new TextDecoder("utf-8", { ignoreBOM: true });
32+
const textDecoder = new TextDecoder("utf-8", { ignoreBOM: true });
3333

3434
// Buffer containing AST. Set before linting a file by `setupSourceForFile`.
3535
export let buffer: BufferWithArrays | null = null;
@@ -245,10 +245,7 @@ export const SOURCE_CODE = Object.freeze({
245245
// This property is present in ESLint's `SourceCode`, but is undocumented
246246
get tokensAndComments(): (Token | Comment)[] {
247247
if (tokensAndComments === null) {
248-
if (tokens === null) {
249-
if (sourceText === null) initSourceText();
250-
initTokens();
251-
}
248+
if (tokens === null) initTokens();
252249
initTokensAndComments();
253250
}
254251
debugAssertIsNonNull(tokensAndComments);

apps/oxlint/src-js/plugins/tokens.ts

Lines changed: 111 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
* `SourceCode` methods related to tokens.
33
*/
44

5-
import { ast, initAst } from "./source_code.ts";
6-
import { buffer, textDecoder } from "./source_code.ts";
5+
import { ast, buffer, initAst, initSourceText, sourceText } from "./source_code.ts";
76
import { getNodeLoc } from "./location.ts";
87
import { TOKENS_OFFSET_POS_32, TOKENS_LEN_POS_32 } from "../generated/constants.ts";
98
import { debugAssert, debugAssertIsNonNull } from "../utils/asserts.ts";
@@ -108,8 +107,8 @@ export interface PunctuatorToken extends BaseToken {
108107
export interface RegularExpressionToken extends BaseToken {
109108
type: "RegularExpression";
110109
regex: {
111-
flags: string;
112110
pattern: string;
111+
flags: string;
113112
};
114113
}
115114

@@ -138,51 +137,133 @@ const TokenProto = Object.create(Object.prototype, {
138137
},
139138
});
140139

141-
// Tokens for the current file parsed by TS-ESLint.
140+
// Tokens for the current file.
142141
// Created lazily only when needed.
143142
export let tokens: Token[] | null = null;
144143
let comments: Comment[] | null = null;
145144
export let tokensAndComments: TokenOrComment[] | null = null;
146145

146+
let uint32: Uint32Array | null = null;
147+
148+
// `ESTreeKind` discriminants (set by Rust side)
149+
const PRIVATE_IDENTIFIER_KIND = 2;
150+
const REGEXP_KIND = 8;
151+
152+
// Indexed by `ESTreeKind` discriminant (matches `ESTreeKind` enum in `estree_kind.rs`)
153+
const TOKEN_TYPES: Token["type"][] = [
154+
"Identifier",
155+
"Keyword",
156+
"PrivateIdentifier",
157+
"Punctuator",
158+
"Numeric",
159+
"String",
160+
"Boolean",
161+
"Null",
162+
"RegularExpression",
163+
"Template",
164+
"JSXText",
165+
"JSXIdentifier",
166+
];
167+
168+
// Details of Rust `Token` type
169+
const TOKEN_SIZE = 16;
170+
const KIND_FIELD_OFFSET = 8;
171+
const IS_ESCAPED_FIELD_OFFSET = 10;
172+
147173
/**
148-
* Initialize TS-ESLint tokens for current file.
174+
* Initialize tokens for current file.
149175
*/
150176
export function initTokens() {
151177
debugAssert(tokens === null, "Tokens already initialized");
152178

153-
// Get tokens JSON from buffer, and deserialize it
154-
debugAssertIsNonNull(buffer);
155-
156-
const { uint32 } = buffer;
157-
const tokensJsonLen = uint32[TOKENS_LEN_POS_32];
158-
if (tokensJsonLen === 0) {
159-
tokens = [];
160-
return;
161-
}
179+
// Deserialize tokens from buffer
180+
if (sourceText === null) initSourceText();
181+
debugAssertIsNonNull(sourceText);
162182

163-
const tokensJsonOffset = uint32[TOKENS_OFFSET_POS_32];
164-
const tokensJson = textDecoder.decode(
165-
buffer.subarray(tokensJsonOffset, tokensJsonOffset + tokensJsonLen),
166-
);
167-
tokens = JSON.parse(tokensJson) as Token[];
183+
debugAssertIsNonNull(buffer);
184+
uint32 = buffer.uint32;
168185

169-
// Add `range` property to each token, and set prototype of each to `TokenProto` which provides getter for `loc`
170-
for (const token of tokens) {
171-
const { start, end } = token;
172-
debugAssert(
173-
typeof start === "number" && typeof end === "number",
174-
"Precomputed tokens should include `start` and `end`",
175-
);
186+
let pos = uint32[TOKENS_OFFSET_POS_32];
187+
const len = uint32[TOKENS_LEN_POS_32];
188+
const endPos = pos + len * TOKEN_SIZE;
176189

177-
token.range = [start, end];
178-
// `TokenProto` provides getter for `loc`
179-
Object.setPrototypeOf(token, TokenProto);
190+
tokens = [];
191+
while (pos < endPos) {
192+
tokens.push(deserializeToken(pos));
193+
pos += TOKEN_SIZE;
180194
}
181195

196+
uint32 = null;
197+
182198
// Check `tokens` have valid ranges and are in ascending order
183199
debugCheckValidRanges(tokens, "token");
184200
}
185201

202+
/**
203+
* Deserialize a token from buffer at position `pos`.
204+
* @param pos - Position in buffer containing Rust `Token` type
205+
* @returns `Token` object
206+
*/
207+
function deserializeToken(pos: number): Token {
208+
const pos32 = pos >> 2;
209+
const start = uint32![pos32],
210+
end = uint32![pos32 + 1];
211+
212+
let value = sourceText!.slice(start, end);
213+
214+
const kind = buffer![pos + KIND_FIELD_OFFSET];
215+
216+
if (kind === REGEXP_KIND) {
217+
const patternEnd = value.lastIndexOf("/");
218+
return {
219+
// @ts-expect-error - TS doesn't understand `__proto__`
220+
__proto__: TokenProto,
221+
type: "RegularExpression",
222+
value,
223+
regex: {
224+
pattern: value.slice(1, patternEnd),
225+
flags: value.slice(patternEnd + 1),
226+
},
227+
start,
228+
end,
229+
range: [start, end],
230+
};
231+
}
232+
233+
// Strip leading `#` from private identifiers
234+
if (kind === PRIVATE_IDENTIFIER_KIND) value = value.slice(1);
235+
236+
// Unescape identifiers, keywords, and private identifiers
237+
if (kind <= PRIVATE_IDENTIFIER_KIND && buffer![pos + IS_ESCAPED_FIELD_OFFSET] === 1) {
238+
value = unescapeIdentifier(value);
239+
}
240+
241+
return {
242+
// @ts-expect-error - TS doesn't understand `__proto__`
243+
__proto__: TokenProto,
244+
type: TOKEN_TYPES[kind],
245+
value,
246+
start,
247+
end,
248+
range: [start, end],
249+
};
250+
}
251+
252+
/**
253+
* Unescape an identifier.
254+
*
255+
* We do this on JS side, because escaped identifiers are so extremely rare that this function
256+
* is never called in practice anyway.
257+
*
258+
* @param {string} name - Identifier name to unescape
259+
* @returns {string} - Unescaped identifier name
260+
*/
261+
function unescapeIdentifier(name: string): string {
262+
return name.replace(/\\u(?:\{([0-9a-fA-F]+)\}|([0-9a-fA-F]{4}))/g, (_, hex1, hex2) =>
263+
String.fromCodePoint(parseInt(hex1 ?? hex2, 16)),
264+
);
265+
}
266+
186267
/**
187268
* Check `tokens` have valid ranges and are in ascending order.
188269
*
@@ -332,7 +413,7 @@ function debugCheckTokensAndComments() {
332413
}
333414

334415
/**
335-
* Discard TS-ESLint tokens to free memory.
416+
* Discard tokens to free memory.
336417
*/
337418
export function resetTokens() {
338419
tokens = null;

apps/oxlint/src-js/utils/globals.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,5 @@ export const { parse: JSONParse, stringify: JSONStringify } = JSON;
3232
export const { ownKeys: ReflectOwnKeys } = Reflect;
3333

3434
export const { iterator: SymbolIterator } = Symbol;
35+
36+
export const { fromCodePoint: StringFromCodePoint } = String;

apps/oxlint/src/js_plugins/parse.rs

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use napi_derive::napi;
88

99
use oxc_allocator::Allocator;
1010
use oxc_ast_visit::utf8_to_utf16::Utf8ToUtf16;
11-
use oxc_estree_tokens::{ESTreeTokenOptionsJS, to_estree_tokens_json};
11+
use oxc_estree_tokens::{ESTreeTokenOptionsJS, update_tokens};
1212
use oxc_linter::RawTransferMetadata2 as RawTransferMetadata;
1313
use oxc_napi::get_source_type;
1414
use oxc_parser::{ParseOptions, Parser, ParserReturn, config::RuntimeParserConfig};
@@ -177,7 +177,7 @@ unsafe fn parse_raw_impl(
177177
})
178178
.with_config(RuntimeParserConfig::new(true))
179179
.parse();
180-
let ParserReturn { program: parsed_program, errors, tokens, panicked, .. } = parser_ret;
180+
let ParserReturn { program: parsed_program, errors, mut tokens, panicked, .. } = parser_ret;
181181
let program = allocator.alloc(parsed_program);
182182

183183
let mut parsing_failed = panicked || (!errors.is_empty() && !ignore_non_fatal_errors);
@@ -214,21 +214,14 @@ unsafe fn parse_raw_impl(
214214
Utf8ToUtf16::new(source_text)
215215
};
216216

217-
let tokens_json = to_estree_tokens_json(
218-
&tokens,
219-
program,
220-
original_source_text,
221-
&span_converter,
222-
ESTreeTokenOptionsJS,
223-
);
217+
update_tokens(&mut tokens, program, &span_converter, ESTreeTokenOptionsJS);
224218

225219
span_converter.convert_program(program);
226220
span_converter.convert_comments(&mut program.comments);
227221

228-
let tokens_json = allocator.alloc_str(&tokens_json);
229-
let tokens_offset = tokens_json.as_ptr() as u32;
222+
let tokens_offset = tokens.as_ptr() as u32;
230223
#[expect(clippy::cast_possible_truncation)]
231-
let tokens_len = tokens_json.len() as u32;
224+
let tokens_len = tokens.len() as u32;
232225

233226
// Return offset of `Program` within buffer (bottom 32 bits of pointer)
234227
let program_offset = ptr::from_ref(program) as u32;

crates/oxc_linter/src/context/host.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,6 @@ impl<'a> ContextSubHost<'a> {
109109
pub fn framework_options(&self) -> FrameworkOptions {
110110
self.framework_options
111111
}
112-
113-
/// Parser tokens collected for this script block.
114-
pub fn parser_tokens(&self) -> Option<&[Token]> {
115-
self.parser_tokens.as_ref().map(|tokens| &tokens[..])
116-
}
117112
}
118113

119114
/// Stores shared information about a file being linted.
@@ -239,6 +234,16 @@ impl<'a> ContextHost<'a> {
239234
&self.current_sub_host().disable_directives
240235
}
241236

237+
/// Shared reference to the parser tokens collected for this script block.
238+
pub fn parser_tokens(&self) -> Option<&[Token]> {
239+
self.current_sub_host().parser_tokens.as_ref().map(|tokens| &tokens[..])
240+
}
241+
242+
/// Mutable reference to the parser tokens collected for this script block.
243+
pub fn parser_tokens_mut(&mut self) -> Option<&mut ArenaVec<'a, Token>> {
244+
self.current_sub_host_mut().parser_tokens.as_mut()
245+
}
246+
242247
/// Path to the file being linted.
243248
///
244249
/// When created from a [`LintService`](`crate::service::LintService`), this

crates/oxc_linter/src/lib.rs

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@ use std::{
1414
string::ToString,
1515
};
1616

17-
use oxc_allocator::{Allocator, AllocatorPool, CloneIn};
17+
use oxc_allocator::{Allocator, AllocatorPool, CloneIn, TakeIn};
1818
use oxc_ast::{ast::Program, ast_kind::AST_TYPE_MAX};
1919
use oxc_ast_macros::ast;
2020
use oxc_ast_visit::utf8_to_utf16::Utf8ToUtf16;
2121
use oxc_data_structures::box_macros::boxed_array;
2222
use oxc_diagnostics::OxcDiagnostic;
23-
use oxc_estree_tokens::{ESTreeTokenOptionsJS, to_estree_tokens_json};
23+
use oxc_estree_tokens::{ESTreeTokenOptionsJS, update_tokens};
24+
use oxc_parser::Token;
2425
use oxc_semantic::AstNode;
2526
use oxc_span::Span;
2627

@@ -474,7 +475,18 @@ impl Linter {
474475
}
475476

476477
// `allocator` is a fixed-size allocator, so no need to clone AST into a new one
477-
self.convert_and_call_external_linter(external_rules, path, ctx_host, program, allocator);
478+
let tokens = ctx_host
479+
.parser_tokens_mut()
480+
.map(|tokens| tokens.take_in(allocator).into_bump_slice_mut());
481+
482+
self.convert_and_call_external_linter(
483+
external_rules,
484+
path,
485+
ctx_host,
486+
program,
487+
tokens,
488+
allocator,
489+
);
478490
}
479491

480492
#[cfg(not(all(target_pointer_width = "64", target_endian = "little")))]
@@ -524,11 +536,15 @@ impl Linter {
524536
js_allocator.alloc(program)
525537
};
526538

539+
// Clone tokens into fixed-size allocator
540+
let tokens = ctx_host.parser_tokens().map(|tokens| js_allocator.alloc_slice_copy(tokens));
541+
527542
self.convert_and_call_external_linter(
528543
external_rules,
529544
path,
530545
ctx_host,
531546
program,
547+
tokens,
532548
&js_allocator,
533549
);
534550

@@ -546,6 +562,7 @@ impl Linter {
546562
path: &Path,
547563
ctx_host: &ContextHost<'_>,
548564
program: &mut Program<'_>,
565+
tokens: Option<&mut [Token]>,
549566
allocator: &Allocator,
550567
) {
551568
// If has BOM, remove it
@@ -569,23 +586,15 @@ impl Linter {
569586
Utf8ToUtf16::new(source_text)
570587
};
571588

572-
let (tokens_offset, tokens_len) =
573-
if let Some(tokens) = ctx_host.current_sub_host().parser_tokens() {
574-
let tokens_json = to_estree_tokens_json(
575-
tokens,
576-
program,
577-
original_source_text,
578-
&span_converter,
579-
ESTreeTokenOptionsJS,
580-
);
581-
let tokens_json = allocator.alloc_str(&tokens_json);
582-
let tokens_offset = tokens_json.as_ptr() as u32;
583-
#[expect(clippy::cast_possible_truncation)]
584-
let tokens_len = tokens_json.len() as u32;
585-
(tokens_offset, tokens_len)
586-
} else {
587-
(0, 0)
588-
};
589+
let (tokens_offset, tokens_len) = if let Some(tokens) = tokens {
590+
update_tokens(tokens, program, &span_converter, ESTreeTokenOptionsJS);
591+
let tokens_offset = tokens.as_ptr() as u32;
592+
#[expect(clippy::cast_possible_truncation)]
593+
let tokens_len = tokens.len() as u32;
594+
(tokens_offset, tokens_len)
595+
} else {
596+
(0, 0)
597+
};
589598

590599
span_converter.convert_program(program);
591600
span_converter.convert_comments(&mut program.comments);
@@ -752,9 +761,9 @@ pub struct RawTransferMetadata2 {
752761
pub is_jsx: bool,
753762
/// `true` if source text has a BOM.
754763
pub has_bom: bool,
755-
/// Offset of serialized ESTree tokens JSON within buffer.
764+
/// Offset of lexer `Token`s within buffer.
756765
pub tokens_offset: u32,
757-
/// UTF-8 byte length of serialized ESTree tokens JSON.
766+
/// Number of lexer `Token`s.
758767
pub tokens_len: u32,
759768
}
760769

0 commit comments

Comments (0)