Skip to content

Rename columns fails after read_parquet with ValueError: Unable to coerce to Series #7017

@rubenvdg

Description

@rubenvdg
import pandas as pd
import dask.dataframe as dd
import numpy as np

path = "test.parquet"

# make an example parquet file
pd.DataFrame(columns=["a", "b", "c"], data=np.random.uniform(size=(10, 3))).to_parquet(path)

# read it with dask and rename columns
ddf = dd.read_parquet(path)
ddf.columns = ["d", "e", "f"]
ddf.compute()

This throws:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-1cce38752e55> in <module>
      9 ddf = dd.read_parquet(path)
     10 ddf.columns = ["d", "e", "f"]
---> 11 ddf.compute()

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
    277         dask.base.compute
    278         """
--> 279         (result,) = compute(self, traverse=False, **kwargs)
    280         return result
    281 

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
    559     )
    560 
--> 561     dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
    562     keys, postcomputes = [], []
    563     for x in collections:

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, **kwargs)
    330             dsk, keys = _extract_graph_and_keys(val)
    331             groups[opt] = (dsk, keys)
--> 332             _opt = opt(dsk, keys, **kwargs)
    333             _opt_list.append(_opt)
    334 

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
     26         return dsk
     27 
---> 28     dependencies = dsk.get_all_dependencies()
     29     dsk = ensure_dict(dsk)
     30 

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/highlevelgraph.py in get_all_dependencies(self)
    530             A map that maps each key to its dependencies
    531         """
--> 532         all_keys = self.keyset()
    533         missing_keys = all_keys.difference(self.key_dependencies.keys())
    534         if missing_keys:

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/highlevelgraph.py in keyset(self)
    499             self._keys = set()
    500             for layer in self.layers.values():
--> 501                 self._keys.update(layer.keys())
    502         return self._keys
    503 

~/.pyenv/versions/3.8.6/lib/python3.8/_collections_abc.py in __iter__(self)
    718 
    719     def __iter__(self):
--> 720         yield from self._mapping
    721 
    722 KeysView.register(dict_keys)

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/blockwise.py in __iter__(self)
    291 
    292     def __iter__(self):
--> 293         return iter(self._dict)
    294 
    295     def __len__(self):

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/blockwise.py in _dict(self)
    593             for k in dsk:
    594                 io_key = (self.io_name,) + tuple([k[i] for i in range(1, len(k))])
--> 595                 if io_key in dsk[k]:
    596                     # Inject IO-function arguments into the blockwise graph
    597                     # as a single (packed) tuple.

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/common.py in new_method(self, other)
     63         other = item_from_zerodim(other)
     64 
---> 65         return method(self, other)
     66 
     67     return new_method

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/arraylike.py in __eq__(self, other)
     27     @unpack_zerodim_and_defer("__eq__")
     28     def __eq__(self, other):
---> 29         return self._cmp_method(other, operator.eq)
     30 
     31     @unpack_zerodim_and_defer("__ne__")

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/frame.py in _cmp_method(self, other, op)
   5963         axis = 1  # only relevant for Series other case
   5964 
-> 5965         self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
   5966 
   5967         # See GH#4537 for discussion of scalar op behavior

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/__init__.py in align_method_FRAME(left, right, axis, flex, level)
    260             )
    261         # GH17901
--> 262         right = to_series(right)
    263 
    264     if flex is not None and isinstance(right, ABCDataFrame):

~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/__init__.py in to_series(right)
    217         else:
    218             if len(left.columns) != len(right):
--> 219                 raise ValueError(
    220                     msg.format(req_len=len(left.columns), given_len=len(right))
    221                 )

ValueError: Unable to coerce to Series, length must be 3: given 2

If I call .persist or .repartition after reading the parquet file, it works without any problems.

  • Dask version: 2020.12.0
  • Python version: 3.8.6
  • Operating System: macOS Big Sur 11.1, Ubuntu 18
  • Install method (conda, pip, source): pip with pyenv

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions