Hello,
In one of my experiments, training and validation on both a TPUv2 (single core) and a TPUv3 (single core) completed successfully for the first chunk of the first epoch. At the start of the second chunk, I got the error below:
Device is TPU:0
2023-12-03 18:43:08.088299: F ./torch_xla/csrc/runtime/debug_macros.h:20] Non-OK-status: status.status() status: INVALID_ARGUMENT: Transpose dimensions [0,1,-2,-1] are not a permutation of the operand dimensions (operand shape is f32[9,9]).
*** Begin stack trace ***
tsl::CurrentStackTrace()
xla::Shape const* ConsumeValue<xla::Shape const*>(absl::lts_20230125::StatusOr<xla::Shape const*>&&)
torch_xla::ShapeHelper::ShapeOfXlaOp(xla::XlaOp)
torch_xla::BuildDiagonalViewUpdate(xla::XlaOp, xla::XlaOp, long, long, long)
torch_xla::DiagonalViewUpdate::Lower(torch_xla::LoweringContext*) const
torch_xla::LoweringContext::LowerNode(torch::lazy::Node const*)
torch_xla::LoweringContext::LoweringContext(std::string const&, torch::lazy::BackendDevice, c10::ArrayRef<torch::lazy::Node const*>, std::unordered_map<torch::lazy::Node const*, torch::lazy::Util::EmitStatus, std::hash<torch::lazy::Node const*>, std::equal_to<torch::lazy::Node const*>, std::allocator<std::pair<torch::lazy::Node const* const, torch::lazy::Util::EmitStatus> > >)
torch_xla::XLAGraphExecutor::Compile(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > > const&, absl::lts_20230125::Span<std::string const>, torch::lazy::LazyGraphExecutor::SyncTensorCollection const&, torch::lazy::LazyGraphExecutor::PostOrderData*, std::vector<torch::lazy::Value, std::allocator<torch::lazy::Value> > const&)
torch_xla::XLAGraphExecutor::SyncTensorsGraphInternal(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > >*, absl::lts_20230125::Span<std::string const>, torch::lazy::LazyGraphExecutor::SyncTensorsConfig const&, bool)
torch_xla::XLAGraphExecutor::SyncTensorsGraph(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > >*, absl::lts_20230125::Span<std::string const>, bool, bool, bool)
torch_xla::XLAGraphExecutor::SyncLiveTensorsGraph(torch::lazy::BackendDevice const*, c10::ArrayRef<std::string>, bool)
PyCFunction_Call
_PyObject_MakeTpCall
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_Vectorcall
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
PyEval_EvalCode
PyRun_SimpleFileExFlags
Py_RunMain
Py_BytesMain
__libc_start_main
_start
*** End stack trace ***
https://symbolize.stripped_domain/r/?trace=7fe863d8b00b,7fe863d8b08f,7fe70b836517,7fe70b83657d,7fe70b4eb93f,7fe70b792fdc,7fe70b82c5ec,7fe70b82cade,7fe70b66a634,7fe70b66c6b8,7fe70b66ccfa,7fe70b66d127,7fe70b43c929,7fe70b43cd65,7fe70b41bf1f,5d5498,8fdaff&map=06b7eaee513554b0b69f7d4d65fa69f6858d5374:7fe706c02000-7fe715441e40
*** SIGABRT received by PID 660093 (TID 660093) on cpu 24 from PID 660093; stack trace: ***
PC: @ 0x7fe863d8b00b (unknown) raise
@ 0x7fe705f9f53a 1152 (unknown)
@ 0x7fe863d8b090 (unknown) (unknown)
@ 0x7fe70b836518 432 ConsumeValue<>()
@ 0x7fe70b83657e 64 torch_xla::ShapeHelper::ShapeOfXlaOp()
@ 0x7fe70b4eb940 672 torch_xla::BuildDiagonalViewUpdate()
@ 0x7fe70b792fdd 80 torch_xla::DiagonalViewUpdate::Lower()
@ 0x7fe70b82c5ed 112 torch_xla::LoweringContext::LowerNode()
@ 0x7fe70b82cadf 224 torch_xla::LoweringContext::LoweringContext()
@ 0x7fe70b66a635 4192 torch_xla::XLAGraphExecutor::Compile()
@ 0x7fe70b66c6b9 1008 torch_xla::XLAGraphExecutor::SyncTensorsGraphInternal()
@ 0x7fe70b66ccfb 560 torch_xla::XLAGraphExecutor::SyncTensorsGraph()
@ 0x7fe70b66d128 1072 torch_xla::XLAGraphExecutor::SyncLiveTensorsGraph()
@ 0x7fe70b43c92a 720 torch_xla::(anonymous namespace)::StepMarker()
@ 0x7fe70b43cd66 128 pybind11::cpp_function::initialize<>()::{lambda()#3}::_FUN()
@ 0x7fe70b41bf20 528 pybind11::cpp_function::dispatcher()
@ 0x5d5499 (unknown) PyCFunction_Call
@ 0x8fdb00 (unknown) (unknown)
https://symbolize.stripped_domain/r/?trace=7fe863d8b00b,7fe705f9f539,7fe863d8b08f,7fe70b836517,7fe70b83657d,7fe70b4eb93f,7fe70b792fdc,7fe70b82c5ec,7fe70b82cade,7fe70b66a634,7fe70b66c6b8,7fe70b66ccfa,7fe70b66d127,7fe70b43c929,7fe70b43cd65,7fe70b41bf1f,5d5498,8fdaff&map=06b7eaee513554b0b69f7d4d65fa69f6858d5374:7fe706c02000-7fe715441e40,abbd016d9542b8098892badc0b19ea68:7fe6f8df5000-7fe7061b3cf0
E1203 18:43:08.325543 660093 coredump_hook.cc:447] RAW: Remote crash data gathering hook invoked.
E1203 18:43:08.325562 660093 coredump_hook.cc:486] RAW: Skipping coredump since rlimit was 0 at process start.
E1203 18:43:08.325577 660093 client.cc:272] RAW: Coroner client retries enabled (b/136286901), will retry for up to 30 sec.
E1203 18:43:08.325588 660093 coredump_hook.cc:542] RAW: Sending fingerprint to remote end.
E1203 18:43:08.325612 660093 coredump_hook.cc:551] RAW: Cannot send fingerprint to Coroner: [NOT_FOUND] stat failed on crash reporting socket /var/google/services/logmanagerd/remote_coredump.socket (Is the listener running?): No such file or directory
E1203 18:43:08.325626 660093 coredump_hook.cc:603] RAW: Dumping core locally.
E1203 18:43:08.702843 660093 process_state.cc:783] RAW: Raising signal 6 with default behavior
The dataloader does nothing different for the second chunk of the first epoch, so I do not think the input pipeline itself is the cause.
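For context, the trace points at torch_xla's DiagonalViewUpdate lowering (BuildDiagonalViewUpdate), so I suspect an in-place update through a diagonal view somewhere in the model. A minimal sketch of the kind of operation I mean (the shape matches the f32[9,9] operand in the error; the code is just an illustration, not my actual model):

```python
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()

# f32[9,9] operand, matching the shape reported in the error
m = torch.zeros(9, 9, device=device)

# In-place op on a diagonal view; as far as I can tell, this is the
# pattern that lowers to torch_xla::DiagonalViewUpdate in the trace.
m.diagonal().add_(1.0)

# Graph lowering happens at the step marker (StepMarker /
# SyncLiveTensorsGraph in the trace), which is where the crash fires.
xm.mark_step()
```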
I would appreciate any help.
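In case it helps with the diagnosis: assuming the culprit really is an in-place diagonal update, a workaround I am considering is the functional torch.diagonal_scatter, which builds a new tensor instead of writing through a view and so should avoid the DiagonalViewUpdate node entirely:

```python
import torch

# Functional alternative to an in-place m.diagonal().copy_(v):
# returns a new tensor with v written along the diagonal, with no
# in-place view update for torch_xla to lower.
m = torch.zeros(9, 9)
m = torch.diagonal_scatter(m, torch.ones(9))
```

Best regards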