Skip to content

Commit 68ad332

Browse files
authored
Remove Frame mutex and use DataStack bump allocator for LocalsPlus (#7333)
* Remove PyMutex<FrameState> from Frame, use UnsafeCell fields directly Move stack, cells_frees, prev_line out of the mutex-protected FrameState into Frame as FrameUnsafeCell fields. This eliminates mutex lock/unlock overhead on every frame execution (with_exec). Safety relies on the same single-threaded execution guarantee that FastLocals already uses. * Add thread-local DataStack for bump-allocating frame data Introduce DataStack with linked chunks (16KB initial, doubling) and push/pop bump allocation. Add datastack field to VirtualMachine. Not yet wired to frame creation. * Unify FastLocals and BoxVec stack into LocalsPlus Replace separate FastLocals (Box<[Option<PyObjectRef>]>) and BoxVec<Option<PyStackRef>> with a single LocalsPlus struct that stores both in a contiguous Box<[usize]> array. The first nlocalsplus slots are fastlocals and the rest is the evaluation stack. Typed access is provided through transmute-based methods. Remove BoxVec import from frame.rs. * Use DataStack for LocalsPlus in non-generator function calls Normal function calls now bump-allocate LocalsPlus data from the per-thread DataStack instead of a separate heap allocation. Generator/coroutine frames continue using heap allocation since they outlive the call. On frame exit, data is copied to the heap (materialize_to_heap) to preserve locals for tracebacks, then the DataStack is popped. VirtualMachine.datastack is wrapped in UnsafeCell for interior mutability (safe because frame allocation is single-threaded LIFO). * Fix clippy: import Layout from core::alloc instead of alloc::alloc * Fix vectorcall compatibility with LocalsPlus API Update vectorcall dispatch functions to use localsplus stack accessors instead of direct stack field access. Add stack_truncate method to LocalsPlus. Update vectorcall fast path in function.rs to use datastack and fastlocals_mut(). * Add datastack, nlocalsplus, ncells, tstate to cspell dictionary * Fix DataStack pop() for non-monotonic allocation addresses Check both bounds of the current chunk when determining if a pop base is in the current chunk. The previous check (base >= chunk_start) fails on Windows where newer chunks may be allocated at lower addresses than older ones. * Fix stale comments: release_datastack -> materialize_localsplus * Fix non-threading mode for parallel test execution Two fixes for Cell-based types used in static items under non-threading mode, which cause data races when Rust test runner uses parallel threads: 1. LazyLock: use std::sync::LazyLock when std is available instead of wrapping core::cell::LazyCell with a false `unsafe impl Sync`. The LazyCell wrapper is kept only for no-std (truly single-threaded). 2. gc_state: use static_cell! (thread-local in non-threading mode) instead of OnceLock, so each thread gets its own GcState with Cell-based PyRwLock/PyMutex that are not accessed concurrently. * Fix CallAllocAndEnterInit to use LocalsPlus stack API * Use checked arithmetic in LocalsPlus and DataStack allocators * Address code review: checked arithmetic, threading feature deps, Send gate - Use checked arithmetic for nlocalsplus in Frame::new - Add "std" to threading feature dependencies in rustpython-common - Gate GcState Send impl with #[cfg(feature = "threading")] * Clean up comments: remove redundant/stale remarks, fix CPython references
1 parent a98c646 commit 68ad332

File tree

12 files changed

+966
-237
lines changed

12 files changed

+966
-237
lines changed

.cspell.dict/cpython.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ CONVFUNC
4343
convparam
4444
copyslot
4545
cpucount
46+
datastack
4647
defaultdict
4748
denom
4849
deopt
@@ -118,13 +119,15 @@ mult
118119
multibytecodec
119120
nameobj
120121
nameop
122+
ncells
121123
nconsts
122124
newargs
123125
newfree
124126
NEWLOCALS
125127
newsemlockobject
126128
nfrees
127129
nkwargs
130+
nlocalsplus
128131
nkwelts
129132
Nondescriptor
130133
noninteger
@@ -192,6 +195,7 @@ testconsole
192195
ticketer
193196
tmptype
194197
tok_oldval
198+
tstate
195199
tvars
196200
typeobject
197201
typeparam

crates/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ license.workspace = true
1111
[features]
1212
default = ["std"]
1313
std = []
14-
threading = ["parking_lot"]
14+
threading = ["parking_lot", "std"]
1515
wasm_js = ["getrandom/wasm_js"]
1616

1717
[dependencies]

crates/common/src/lock.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,28 @@ cfg_if::cfg_if! {
1010
if #[cfg(feature = "threading")] {
1111
pub use parking_lot::{RawMutex, RawRwLock, RawThreadId};
1212

13-
pub use std::sync::{LazyLock, OnceLock as OnceCell};
13+
pub use std::sync::OnceLock as OnceCell;
1414
pub use core::cell::LazyCell;
1515
} else {
1616
mod cell_lock;
1717
pub use cell_lock::{RawCellMutex as RawMutex, RawCellRwLock as RawRwLock, SingleThreadId as RawThreadId};
1818

1919
pub use core::cell::{LazyCell, OnceCell};
20+
}
21+
}
2022

21-
/// `core::cell::LazyCell` with `Sync` for use in `static` items.
22-
/// SAFETY: Without threading, there can be no concurrent access.
23+
// LazyLock: uses std::sync::LazyLock when std is available (even without
24+
// threading, because Rust test runner uses parallel threads).
25+
// Without std, uses a LazyCell wrapper (truly single-threaded only).
26+
cfg_if::cfg_if! {
27+
if #[cfg(any(feature = "threading", feature = "std"))] {
28+
pub use std::sync::LazyLock;
29+
} else {
2330
pub struct LazyLock<T, F = fn() -> T>(core::cell::LazyCell<T, F>);
24-
// SAFETY: Without threading, there can be no concurrent access.
31+
// SAFETY: This branch is only active when both "std" and "threading"
32+
// features are absent — i.e., truly single-threaded no_std environments
33+
// (e.g., embedded or bare-metal WASM). Without std, the Rust runtime
34+
// cannot spawn threads, so Sync is trivially satisfied.
2535
unsafe impl<T, F> Sync for LazyLock<T, F> {}
2636

2737
impl<T, F: FnOnce() -> T> LazyLock<T, F> {

crates/vm/src/builtins/frame.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,7 @@ impl Py<Frame> {
692692
// Clear fastlocals
693693
// SAFETY: Frame is not executing (detached or stopped).
694694
{
695-
let fastlocals = unsafe { self.fastlocals.borrow_mut() };
695+
let fastlocals = unsafe { self.fastlocals_mut() };
696696
for slot in fastlocals.iter_mut() {
697697
*slot = None;
698698
}

crates/vm/src/builtins/function.rs

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ impl PyFunction {
236236
// https://github.com/python/cpython/blob/main/Python/ceval.c#L3681
237237

238238
// SAFETY: Frame was just created and not yet executing.
239-
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
239+
let fastlocals = unsafe { frame.fastlocals_mut() };
240240

241241
let mut args_iter = func_args.args.into_iter();
242242

@@ -562,6 +562,7 @@ impl Py<PyFunction> {
562562

563563
let is_gen = code.flags.contains(bytecode::CodeFlags::GENERATOR);
564564
let is_coro = code.flags.contains(bytecode::CodeFlags::COROUTINE);
565+
let use_datastack = !(is_gen || is_coro);
565566

566567
// Construct frame:
567568
let frame = Frame::new(
@@ -570,6 +571,7 @@ impl Py<PyFunction> {
570571
self.builtins.clone(),
571572
self.closure.as_ref().map_or(&[], |c| c.as_slice()),
572573
Some(self.to_owned().into()),
574+
use_datastack,
573575
vm,
574576
)
575577
.into_ref(&vm.ctx);
@@ -594,7 +596,16 @@ impl Py<PyFunction> {
594596
frame.set_generator(&obj);
595597
Ok(obj)
596598
}
597-
(false, false) => vm.run_frame(frame),
599+
(false, false) => {
600+
let result = vm.run_frame(frame.clone());
601+
// Release data stack memory after frame execution completes.
602+
unsafe {
603+
if let Some(base) = frame.materialize_localsplus() {
604+
vm.datastack_pop(base);
605+
}
606+
}
607+
result
608+
}
598609
}
599610
}
600611

@@ -665,28 +676,35 @@ impl Py<PyFunction> {
665676
self.builtins.clone(),
666677
self.closure.as_ref().map_or(&[], |c| c.as_slice()),
667678
Some(self.to_owned().into()),
679+
true, // Always use datastack (invoke_exact_args is never gen/coro)
668680
vm,
669681
)
670682
.into_ref(&vm.ctx);
671683

672684
// Move args directly into fastlocals (no clone/refcount needed)
673685
{
674-
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
686+
let fastlocals = unsafe { frame.fastlocals_mut() };
675687
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..)) {
676688
*slot = Some(arg);
677689
}
678690
}
679691

680692
// Handle cell2arg
681693
if let Some(cell2arg) = code.cell2arg.as_deref() {
682-
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
694+
let fastlocals = unsafe { frame.fastlocals_mut() };
683695
for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
684696
let x = fastlocals[*arg_idx as usize].take();
685697
frame.set_cell_contents(cell_idx, x);
686698
}
687699
}
688700

689-
vm.run_frame(frame)
701+
let result = vm.run_frame(frame.clone());
702+
unsafe {
703+
if let Some(base) = frame.materialize_localsplus() {
704+
vm.datastack_pop(base);
705+
}
706+
}
707+
result
690708
}
691709
}
692710

@@ -1291,26 +1309,33 @@ pub(crate) fn vectorcall_function(
12911309
zelf.builtins.clone(),
12921310
zelf.closure.as_ref().map_or(&[], |c| c.as_slice()),
12931311
Some(zelf.to_owned().into()),
1312+
true, // Always use datastack (is_simple excludes gen/coro)
12941313
vm,
12951314
)
12961315
.into_ref(&vm.ctx);
12971316

12981317
{
1299-
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
1318+
let fastlocals = unsafe { frame.fastlocals_mut() };
13001319
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..nargs)) {
13011320
*slot = Some(arg);
13021321
}
13031322
}
13041323

13051324
if let Some(cell2arg) = code.cell2arg.as_deref() {
1306-
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
1325+
let fastlocals = unsafe { frame.fastlocals_mut() };
13071326
for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
13081327
let x = fastlocals[*arg_idx as usize].take();
13091328
frame.set_cell_contents(cell_idx, x);
13101329
}
13111330
}
13121331

1313-
return vm.run_frame(frame);
1332+
let result = vm.run_frame(frame.clone());
1333+
unsafe {
1334+
if let Some(base) = frame.materialize_localsplus() {
1335+
vm.datastack_pop(base);
1336+
}
1337+
}
1338+
return result;
13141339
}
13151340

13161341
// SLOW PATH: construct FuncArgs from owned Vec and delegate to invoke()

crates/vm/src/builtins/super.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ impl Initializer for PySuper {
8686
return Err(vm.new_runtime_error("super(): no arguments"));
8787
}
8888
// SAFETY: Frame is current and not concurrently mutated.
89-
let obj = unsafe { frame.fastlocals.borrow() }[0]
89+
let obj = unsafe { frame.fastlocals() }[0]
9090
.clone()
9191
.or_else(|| {
9292
if let Some(cell2arg) = frame.code.cell2arg.as_deref() {

0 commit comments

Comments
 (0)