-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Closed
Labels
Description
If we have a pandas DataFrame with a column of the `string` extension dtype (not `object`), then our quantiles code doesn't know what to do. We should either call `astype(object)` (a good short-term fix) or figure out a nicer way to handle this.
import dask.dataframe as dd, pandas as pd
df = pd.DataFrame({"x": range(5), "y": "abcde"})
df["y"] = df.y.astype("string")
ddf = dd.from_pandas(df, npartitions=1)
ddf.set_index("y")

TypeError Traceback (most recent call last)
<ipython-input-6-b96d10f4863b> in <module>
----> 1 ddf.set_index("y")
~/workspace/dask/dask/dataframe/core.py in set_index(***failed resolving arguments***)
4226 from .shuffle import set_index
4227
-> 4228 return set_index(
4229 self,
4230 other,
~/workspace/dask/dask/dataframe/shuffle.py in set_index(df, index, npartitions, shuffle, compute, drop, upsample, divisions, partition_size, **kwargs)
160
161 if divisions is None:
--> 162 divisions, mins, maxes = _calculate_divisions(
163 df, index2, repartition, npartitions, upsample, partition_size
164 )
~/workspace/dask/dask/dataframe/shuffle.py in _calculate_divisions(df, partition_col, repartition, npartitions, upsample, partition_size)
33 mins = partition_col.map_partitions(M.min)
34 maxes = partition_col.map_partitions(M.max)
---> 35 divisions, sizes, mins, maxes = base.compute(divisions, sizes, mins, maxes)
36 divisions = methods.tolist(divisions)
37 if type(sizes) is not list:
~/workspace/dask/dask/base.py in compute(*args, **kwargs)
566 postcomputes.append(x.__dask_postcompute__())
567
--> 568 results = schedule(dsk, keys, **kwargs)
569 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
570
~/workspace/dask/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
77 pool = MultiprocessingPoolExecutor(pool)
78
---> 79 results = get_async(
80 pool.submit,
81 pool._max_workers,
~/workspace/dask/dask/local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
512 _execute_task(task, data) # Re-execute locally
513 else:
--> 514 raise_exception(exc, tb)
515 res, worker_id = loads(res_info)
516 state["cache"][key] = res
~/workspace/dask/dask/local.py in reraise(exc, tb)
323 if exc.__traceback__ is not tb:
324 raise exc.with_traceback(tb)
--> 325 raise exc
326
327
~/workspace/dask/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
221 try:
222 task, data = loads(task_info)
--> 223 result = _execute_task(task, data)
224 id = get_id()
225 result = dumps((result, id))
~/workspace/dask/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
~/workspace/dask/dask/dataframe/partitionquantiles.py in percentiles_summary(df, num_old, num_new, upsample, state)
414 data = data.codes
415 interpolation = "nearest"
--> 416 elif np.issubdtype(data.dtype, np.integer) and not is_cupy_type(data):
417 # CuPy doesn't currently support "nearest" interpolation,
418 # so it's special cased in the condition above.
~/mambaforge/lib/python3.9/site-packages/numpy/core/numerictypes.py in issubdtype(arg1, arg2)
416 """
417 if not issubclass_(arg1, generic):
--> 418 arg1 = dtype(arg1).type
419 if not issubclass_(arg2, generic):
420 arg2 = dtype(arg2).type
TypeError: Cannot interpret 'string[python]' as a data type

cc @TomAugspurger for advice
Reactions are currently unavailable