-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Closed
Description
import pandas as pd
import dask.dataframe as dd
import numpy as np
path = "test.parquet"
# make an example parquet file
pd.DataFrame(columns=["a", "b", "c"], data=np.random.uniform(size=(10, 3))).to_parquet(path)
# read it with dask and rename columns
ddf = dd.read_parquet(path)
ddf.columns = ["d", "e", "f"]
ddf.compute()

This throws:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-6-1cce38752e55> in <module>
9 ddf = dd.read_parquet(path)
10 ddf.columns = ["d", "e", "f"]
---> 11 ddf.compute()
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
277 dask.base.compute
278 """
--> 279 (result,) = compute(self, traverse=False, **kwargs)
280 return result
281
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
559 )
560
--> 561 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
562 keys, postcomputes = [], []
563 for x in collections:
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, **kwargs)
330 dsk, keys = _extract_graph_and_keys(val)
331 groups[opt] = (dsk, keys)
--> 332 _opt = opt(dsk, keys, **kwargs)
333 _opt_list.append(_opt)
334
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
26 return dsk
27
---> 28 dependencies = dsk.get_all_dependencies()
29 dsk = ensure_dict(dsk)
30
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/highlevelgraph.py in get_all_dependencies(self)
530 A map that maps each key to its dependencies
531 """
--> 532 all_keys = self.keyset()
533 missing_keys = all_keys.difference(self.key_dependencies.keys())
534 if missing_keys:
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/highlevelgraph.py in keyset(self)
499 self._keys = set()
500 for layer in self.layers.values():
--> 501 self._keys.update(layer.keys())
502 return self._keys
503
~/.pyenv/versions/3.8.6/lib/python3.8/_collections_abc.py in __iter__(self)
718
719 def __iter__(self):
--> 720 yield from self._mapping
721
722 KeysView.register(dict_keys)
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/blockwise.py in __iter__(self)
291
292 def __iter__(self):
--> 293 return iter(self._dict)
294
295 def __len__(self):
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/dask/blockwise.py in _dict(self)
593 for k in dsk:
594 io_key = (self.io_name,) + tuple([k[i] for i in range(1, len(k))])
--> 595 if io_key in dsk[k]:
596 # Inject IO-function arguments into the blockwise graph
597 # as a single (packed) tuple.
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/arraylike.py in __eq__(self, other)
27 @unpack_zerodim_and_defer("__eq__")
28 def __eq__(self, other):
---> 29 return self._cmp_method(other, operator.eq)
30
31 @unpack_zerodim_and_defer("__ne__")
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/frame.py in _cmp_method(self, other, op)
5963 axis = 1 # only relevant for Series other case
5964
-> 5965 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
5966
5967 # See GH#4537 for discussion of scalar op behavior
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/__init__.py in align_method_FRAME(left, right, axis, flex, level)
260 )
261 # GH17901
--> 262 right = to_series(right)
263
264 if flex is not None and isinstance(right, ABCDataFrame):
~/.pyenv/versions/3.8.6/envs/ezra/lib/python3.8/site-packages/pandas/core/ops/__init__.py in to_series(right)
217 else:
218 if len(left.columns) != len(right):
--> 219 raise ValueError(
220 msg.format(req_len=len(left.columns), given_len=len(right))
221 )
ValueError: Unable to coerce to Series, length must be 3: given 2
If I call `.persist()` or `.repartition()` after reading the parquet file, it works without any problems.
- Dask version: 2020.12.0
- Python version: 3.8.6
- Operating System: MacOS Big Sur 11.1, Ubuntu 18
- Install method (conda, pip, source): pip with pyenv
Reactions are currently unavailable