Skip to content

Unable to read nested types in PyIceberg #357

@Fokko

Description

@Fokko

Apache Iceberg version

None

Please describe the bug 🐞

Unsure whether this is caused by Iceberg-go or PyIceberg, but after creating a table with complex (nested) types, reading it back with PyIceberg fails with the following Arrow error:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[8], line 1
----> 1 tbl.scan().to_pandas()

File /usr/local/lib/python3.10/site-packages/pyiceberg/table/__init__.py:2047, in DataScan.to_pandas(self, **kwargs)
   2046 def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
-> 2047     return self.to_arrow().to_pandas(**kwargs)

File /usr/local/lib/python3.10/site-packages/pyiceberg/table/__init__.py:2017, in DataScan.to_arrow(self)
   2014 def to_arrow(self) -> pa.Table:
   2015     from pyiceberg.io.pyarrow import project_table
-> 2017     return project_table(
   2018         self.plan_files(),
   2019         self.table_metadata,
   2020         self.io,
   2021         self.row_filter,
   2022         self.projection(),
   2023         case_sensitive=self.case_sensitive,
   2024         limit=self.limit,
   2025     )

File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1338, in project_table(tasks, table_metadata, io, row_filter, projected_schema, case_sensitive, limit)
   1336 for future in concurrent.futures.as_completed(futures):
   1337     completed_futures.add(future)
-> 1338     if table_result := future.result():
   1339         total_row_count += len(table_result)
   1340     # stop early if limit is satisfied

File /usr/local/lib/python3.10/concurrent/futures/_base.py:451, in Future.result(self, timeout)
    449     raise CancelledError()
    450 elif self._state == FINISHED:
--> 451     return self.__get_result()
    453 self._condition.wait(timeout)
    455 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /usr/local/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
    401 if self._exception:
    402     try:
--> 403         raise self._exception
    404     finally:
    405         # Break a reference cycle with the exception in self._exception
    406         self = None

File /usr/local/lib/python3.10/concurrent/futures/thread.py:58, in _WorkItem.run(self)
     55     return
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:
     60     self.future.set_exception(exc)

File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1240, in _task_to_table(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping)
   1230 def _task_to_table(
   1231     fs: FileSystem,
   1232     task: FileScanTask,
   (...)
   1238     name_mapping: Optional[NameMapping] = None,
   1239 ) -> Optional[pa.Table]:
-> 1240     batches = list(
   1241         _task_to_record_batches(
   1242             fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping
   1243         )
   1244     )
   1246     if len(batches) > 0:
   1247         return pa.Table.from_batches(batches)

File /usr/local/lib/python3.10/site-packages/pyiceberg/io/pyarrow.py:1210, in _task_to_record_batches(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping)
   1208 next_index = 0
   1209 batches = fragment_scanner.to_batches()
-> 1210 for batch in batches:
   1211     next_index = next_index + len(batch)
   1212     current_index = next_index - len(batch)

File /usr/local/lib/python3.10/site-packages/pyarrow/_dataset.pyx:3769, in _iterator()

File /usr/local/lib/python3.10/site-packages/pyarrow/_dataset.pyx:3387, in pyarrow._dataset.TaggedRecordBatchIterator.__next__()

File /usr/local/lib/python3.10/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()

File /usr/local/lib/python3.10/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

OSError: Malformed levels. min: 2 max: 2 out of range.  Max Level: 1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions