-
Notifications
You must be signed in to change notification settings - Fork 158
Unable to read nested types in PyIceberg #357
Copy link
Copy link
Closed
Description
Apache Iceberg version
None
Please describe the bug 🐞
I'm unsure whether this is related to iceberg-go or PyIceberg, but after creating a table with complex (nested) types, reading it back with PyIceberg via `tbl.scan().to_pandas()` fails with the following Arrow error:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[8], line 1
----> 1 tbl.scan().to_pandas()
File /usr/local/lib/python3.10/site-packages/pyiceberg/table/__init__.py:2047, in DataScan.to_pandas(self, **kwargs)
2046 def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
-> 2047 return self.to_arrow().to_pandas(**kwargs)
File /usr/local/lib/python3.10/site-packages/pyiceberg/table/__init__.py:2017, in DataScan.to_arrow(self)
2014 def to_arrow(self) -> pa.Table:
2015 from pyiceberg.io.pyarrow import project_table
-> 2017 return project_table(
2018 self.plan_files(),
2019 self.table_metadata,
2020 self.io,
2021 self.row_filter,
2022 self.projection(),
2023 case_sensitive=self.case_sensitive,
2024 limit=self.limit,
2025 )
File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1338, in project_table(tasks, table_metadata, io, row_filter, projected_schema, case_sensitive, limit)
1336 for future in concurrent.futures.as_completed(futures):
1337 completed_futures.add(future)
-> 1338 if table_result := future.result():
1339 total_row_count += len(table_result)
1340 # stop early if limit is satisfied
File /usr/local/lib/python3.10/concurrent/futures/_base.py:451, in Future.result(self, timeout)
449 raise CancelledError()
450 elif self._state == FINISHED:
--> 451 return self.__get_result()
453 self._condition.wait(timeout)
455 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File /usr/local/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None
File /usr/local/lib/python3.10/concurrent/futures/thread.py:58, in _WorkItem.run(self)
55 return
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
60 self.future.set_exception(exc)
File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1240, in _task_to_table(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping)
1230 def _task_to_table(
1231 fs: FileSystem,
1232 task: FileScanTask,
(...)
1238 name_mapping: Optional[NameMapping] = None,
1239 ) -> Optional[pa.Table]:
-> 1240 batches = list(
1241 _task_to_record_batches(
1242 fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping
1243 )
1244 )
1246 if len(batches) > 0:
1247 return pa.Table.from_batches(batches)
File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1210, in _task_to_record_batches(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping)
1208 next_index = 0
1209 batches = fragment_scanner.to_batches()
-> 1210 for batch in batches:
1211 next_index = next_index + len(batch)
1212 current_index = next_index - len(batch)
File /usr/local/lib/python3.10/site-packages/pyarrow/_dataset.pyx:3769, in _iterator()
File /usr/local/lib/python3.10/site-packages/pyarrow/_dataset.pyx:3387, in pyarrow._dataset.TaggedRecordBatchIterator.__next__()
File /usr/local/lib/python3.10/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File /usr/local/lib/python3.10/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
OSError: Malformed levels. min: 2 max: 2 out of range. Max Level: 1
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels