Raise exception for not implemented type of merge in dataframe #8138
Raise exception for not implemented type of merge in dataframe #8138ncclementi wants to merge 220 commits intodask:mainfrom
Conversation
jsignell
left a comment
There was a problem hiding this comment.
I have a minor suggestion and it would be great to include a test that raises this exception.
Co-authored-by: Julia Signell <jsignell@gmail.com>
dask/dataframe/tests/test_multi.py
Outdated
| } | ||
| ) | ||
|
|
||
| with pytest.raises(Exception): |
There was a problem hiding this comment.
nitpick, but you can use a specific class of Exception and match= to make sure that you are getting the error you expect.
There was a problem hiding this comment.
+1. Here's an example of another place in the codebase were we specify the specific type of error and message we expect to be raised:
dask/dask/array/tests/test_stats.py
Lines 146 to 147 in 1797c2b
There was a problem hiding this comment.
Thanks for the feedback and example. If the match sentence is too long I can cut it to just match="how='cross". I wasn't sure what do we prefer in this case.
dask/dataframe/multi.py
Outdated
|
|
||
| supported_how = ("left", "right", "outer", "inner") | ||
| if how not in supported_how: | ||
| raise Exception( |
There was a problem hiding this comment.
Let's raise a more specific error here instead of the Exception base class. A ValueError seems appropriate in this particular case. (Sorry I should have brought this up in my earlier comment)
There was a problem hiding this comment.
no problem, : ) just updated
jrbourbeau
left a comment
There was a problem hiding this comment.
It looks like this is causing legitimate GPU failures. For example
15:58:52 ____________ test_merge_tasks_semi_anti_cudf[cudf-leftsemi-parts1] _____________
15:58:52 [gw0] linux -- Python 3.8.10 /opt/conda/envs/dask/bin/python
15:58:52
15:58:52 engine = 'cudf', how = 'leftsemi', parts = (3, 1)
15:58:52
15:58:52 @pytest.mark.gpu
15:58:52 @pytest.mark.parametrize("parts", [(3, 3), (3, 1), (1, 3)])
15:58:52 @pytest.mark.parametrize("how", ["leftsemi", "leftanti"])
15:58:52 @pytest.mark.parametrize(
15:58:52 "engine",
15:58:52 [
15:58:52 "cudf",
15:58:52 pytest.param(
15:58:52 "pandas",
15:58:52 marks=pytest.mark.xfail(
15:58:52 reason="Pandas does not support leftsemi or leftanti"
15:58:52 ),
15:58:52 ),
15:58:52 ],
15:58:52 )
15:58:52 def test_merge_tasks_semi_anti_cudf(engine, how, parts):
15:58:52 if engine == "cudf":
15:58:52 # NOTE: engine == "cudf" requires cudf/dask_cudf,
15:58:52 # will be skipped by non-GPU CI.
15:58:52
15:58:52 cudf = pytest.importorskip("cudf")
15:58:52 dask_cudf = pytest.importorskip("dask_cudf")
15:58:52
15:58:52 emp = pd.DataFrame(
15:58:52 {
15:58:52 "emp_id": np.arange(101, stop=106),
15:58:52 "name": ["John", "Tom", "Harry", "Rahul", "Sakil"],
15:58:52 "city": ["Cal", "Mum", "Del", "Ban", "Del"],
15:58:52 "salary": [50000, 40000, 80000, 60000, 90000],
15:58:52 }
15:58:52 )
15:58:52 skills = pd.DataFrame(
15:58:52 {
15:58:52 "skill_id": [404, 405, 406, 407, 408],
15:58:52 "emp_id": [103, 101, 105, 102, 101],
15:58:52 "skill_name": ["Dask", "Spark", "C", "Python", "R"],
15:58:52 }
15:58:52 )
15:58:52
15:58:52 if engine == "cudf":
15:58:52 emp = cudf.from_pandas(emp)
15:58:52 skills = cudf.from_pandas(skills)
15:58:52 dd_emp = dask_cudf.from_cudf(emp, npartitions=parts[0])
15:58:52 dd_skills = dask_cudf.from_cudf(skills, npartitions=parts[1])
15:58:52 else:
15:58:52 dd_emp = dd.from_pandas(emp, npartitions=parts[0])
15:58:52 dd_skills = dd.from_pandas(skills, npartitions=parts[1])
15:58:52
15:58:52 expect = emp.merge(skills, on="emp_id", how=how).sort_values(["emp_id"])
15:58:52 > result = dd_emp.merge(dd_skills, on="emp_id", how=how).sort_values(["emp_id"])
15:58:52
15:58:52 dask/dataframe/tests/test_multi.py:919:
15:58:52 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
15:58:52 /opt/conda/envs/dask/lib/python3.8/site-packages/dask_cudf/core.py:139: in merge
15:58:52 return super().merge(other, on=on, shuffle="tasks", **kwargs)
15:58:52 dask/dataframe/core.py:4620: in merge
15:58:52 return merge(
15:58:52 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
15:58:52
15:58:52 left = <dask_cudf.DataFrame | 2 tasks | 2 npartitions>
15:58:52 right = <dask_cudf.DataFrame | 1 tasks | 1 npartitions>, how = 'leftsemi'
15:58:52 on = None, left_on = 'emp_id', right_on = 'emp_id', left_index = False
15:58:52 right_index = False, suffixes = ('_x', '_y'), indicator = False
15:58:52 npartitions = None, shuffle = 'tasks', max_branch = None, broadcast = None
15:58:52
15:58:52 @wraps(pd.merge)
15:58:52 def merge(
15:58:52 left,
15:58:52 right,
15:58:52 how="inner",
15:58:52 on=None,
15:58:52 left_on=None,
15:58:52 right_on=None,
15:58:52 left_index=False,
15:58:52 right_index=False,
15:58:52 suffixes=("_x", "_y"),
15:58:52 indicator=False,
15:58:52 npartitions=None,
15:58:52 shuffle=None,
15:58:52 max_branch=None,
15:58:52 broadcast=None,
15:58:52 ):
15:58:52 for o in [on, left_on, right_on]:
15:58:52 if isinstance(o, _Frame):
15:58:52 raise NotImplementedError(
15:58:52 "Dask collections not currently allowed in merge columns"
15:58:52 )
15:58:52 if not on and not left_on and not right_on and not left_index and not right_index:
15:58:52 on = [c for c in left.columns if c in right.columns]
15:58:52 if not on:
15:58:52 left_index = right_index = True
15:58:52
15:58:52 if on and not left_on and not right_on:
15:58:52 left_on = right_on = on
15:58:52 on = None
15:58:52
15:58:52 supported_how = ("left", "right", "outer", "inner")
15:58:52 if how not in supported_how:
15:58:52 > raise ValueError(
15:58:52 f"dask.dataframe.merge does not support how='{how}'. Options are: {supported_how}"
15:58:52 )
15:58:52 E ValueError: dask.dataframe.merge does not support how='leftsemi'. Options are: ('left', 'right', 'outer', 'inner')
15:58:52
15:58:52 dask/dataframe/multi.py:498: ValueErrorIt looks like cuDF's merge(...) supports additional options for how= (e.g. "leftsemi"), though the corresponding API docs say {‘left’, ‘outer’, ‘inner’} are the supported options. @rjzamora @jakirkham can you comment on what values for how= cuDF supports?
|
cc @galipremsagar (in case you have thoughts here 🙂) |
|
Fixed the merge conflicts but we are still having issues with the gpuCI, see #8138 (review) @rjzamora would you be able to comment on this, and suggest a possible approach? |
|
It looks like |
|
JFYI Rick's OOTO. So it might be a bit before we hear from here. Will check-in with him about this when he gets back |
dask/dataframe/multi.py
Outdated
| supported_how = ("left", "right", "outer", "inner") | ||
| if how not in supported_how: | ||
| raise ValueError( | ||
| f"dask.dataframe.merge does not support how='{how}'. Options are: {supported_how}" | ||
| ) | ||
|
|
There was a problem hiding this comment.
My only concern is that cudf also supports "leftsemi" and "leftanti", and dask-cudf is currently using this code. Not sure of the best way to deal with this variation between pandas and cudf.
|
The "easiest" approach is probably to include "leftanti" and "leftsemi" in the list of supported options (as long as pandas raises a reasonable error). |
Yeah this would be possible: >>> import pandas as pd
>>> df = pd.DataFrame({'a':[1, 2, 3]})
>>> df
a
0 1
1 2
2 3
>>> df.merge(df, how='ll')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/frame.py", line 9191, in merge
return merge(
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 120, in merge
return op.get_result()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 714, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 965, in _get_join_info
(left_indexer, right_indexer) = self._get_join_indexers()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 939, in _get_join_indexers
return get_join_indexers(
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 1495, in get_join_indexers
join_func = {
KeyError: 'll'
>>> df.merge(df, how='leftanti')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/frame.py", line 9191, in merge
return merge(
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 120, in merge
return op.get_result()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 714, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 965, in _get_join_info
(left_indexer, right_indexer) = self._get_join_indexers()
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 939, in _get_join_indexers
return get_join_indexers(
File "/nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pandas/core/reshape/merge.py", line 1495, in get_join_indexers
join_func = {
KeyError: 'leftanti'
>>> df.merge(df, how='cross')
a_x a_y
0 1 1
1 1 2
2 1 3
3 2 1
4 2 2
5 2 3
6 3 1
7 3 2
8 3 3 |
|
It seems the easiest fix is to add "leftanti", "leftsemi" to the " "leftanti", "leftsemi" or not actually supported but they were added to this list since CuDF supports them and dask_cudf relies on this code." @jrbourbeau do you think this ^ is enough, or we should find a workaround. |
Implements the suggestion proposed by @choldgraf here dask#8227 (comment) to try and cut down out documentation build time
Co-authored-by: Jim Crist-Harif <jcristharif@gmail.com>
This PR moves the handling of custom sorting functions to `shuffle.sort_values`, so that usages of the internal `sort_values` function will not have to manually specify a default `sort_function` and `sort_function_kwargs`. This originated as a concern in the downstream implementation of this in rapidsai/cudf#9789
|
I think you can just go ahead and add them to the list. This is an improvement over the current state. |
|
Thanks for the ping @jsignell I push a change to include them. |
1 similar comment
|
Thanks for the ping @jsignell I push a change to include them. |
|
Hmmm something seems to be a little wrong with the diff on this. |
|
Oh shoot, I just noticed that. I might have merge main incorrectly on my local version. I can open a new PR that says it superseeds this one and get that solve. Unles there is a better way of doing this. |
1 similar comment
|
Oh shoot, I just noticed that. I might have merge main incorrectly on my local version. I can open a new PR that says it superseeds this one and get that solve. Unles there is a better way of doing this. |
|
Superseded by #8818, closing. |
black dask/flake8 dask/isort daskThis PR raises an exception with a better message when attempting to perform a merge with a type
how="something"that it is not implemented or doesn't exist. For example, the code from issue #8119dd.merge(left, right, how="cross")currently fails with:due to
how="cross"not being implemented.with this PR the traceback looks like: