Skip to content

Flaky test_append_with_partition #7369

@jrbourbeau

Description

@jrbourbeau

Over in #7365 we saw test_append_with_partition fail even though the changes in that PR are unrelated.

Traceback:
2021-03-10T23:01:40.1108257Z ___________________ test_append_with_partition[fastparquet] ____________________
2021-03-10T23:01:40.1109561Z [gw2] linux -- Python 3.8.8 /usr/share/miniconda3/envs/test-environment/bin/python
2021-03-10T23:01:40.1110274Z 
2021-03-10T23:01:40.1111387Z tmpdir = local('/tmp/pytest-of-runner/pytest-0/popen-gw2/test_append_with_partition_fas0')
2021-03-10T23:01:40.1112520Z engine = 'fastparquet'
2021-03-10T23:01:40.1112987Z 
2021-03-10T23:01:40.1113638Z     def test_append_with_partition(tmpdir, engine):
2021-03-10T23:01:40.1114685Z         # check_fastparquet()
2021-03-10T23:01:40.1115302Z         tmp = str(tmpdir)
2021-03-10T23:01:40.1115890Z         df0 = pd.DataFrame(
2021-03-10T23:01:40.1116440Z             {
2021-03-10T23:01:40.1117015Z                 "lat": np.arange(0, 10, dtype="int64"),
2021-03-10T23:01:40.1117733Z                 "lon": np.arange(10, 20, dtype="int64"),
2021-03-10T23:01:40.1118705Z                 "value": np.arange(100, 110, dtype="int64"),
2021-03-10T23:01:40.1119313Z             }
2021-03-10T23:01:40.1119792Z         )
2021-03-10T23:01:40.1120343Z         df0.index.name = "index"
2021-03-10T23:01:40.1120996Z         df1 = pd.DataFrame(
2021-03-10T23:01:40.1121524Z             {
2021-03-10T23:01:40.1122123Z                 "lat": np.arange(10, 20, dtype="int64"),
2021-03-10T23:01:40.1122823Z                 "lon": np.arange(10, 20, dtype="int64"),
2021-03-10T23:01:40.1123559Z                 "value": np.arange(120, 130, dtype="int64"),
2021-03-10T23:01:40.1124144Z             }
2021-03-10T23:01:40.1124603Z         )
2021-03-10T23:01:40.1125146Z         df1.index.name = "index"
2021-03-10T23:01:40.1125873Z         dd_df0 = dd.from_pandas(df0, npartitions=1)
2021-03-10T23:01:40.1126653Z         dd_df1 = dd.from_pandas(df1, npartitions=1)
2021-03-10T23:01:40.1127528Z         dd.to_parquet(dd_df0, tmp, partition_on=["lon"], engine=engine)
2021-03-10T23:01:40.1128717Z         dd.to_parquet(
2021-03-10T23:01:40.1129177Z             dd_df1,
2021-03-10T23:01:40.1129651Z             tmp,
2021-03-10T23:01:40.1130204Z             partition_on=["lon"],
2021-03-10T23:01:40.1130767Z             append=True,
2021-03-10T23:01:40.1131360Z             ignore_divisions=True,
2021-03-10T23:01:40.1131957Z             engine=engine,
2021-03-10T23:01:40.1132464Z         )
2021-03-10T23:01:40.1132879Z     
2021-03-10T23:01:40.1133382Z >       out = dd.read_parquet(
2021-03-10T23:01:40.1134151Z             tmp, engine=engine, index="index", gather_statistics=True
2021-03-10T23:01:40.1134854Z         ).compute()
2021-03-10T23:01:40.1135230Z 
2021-03-10T23:01:40.1135778Z dask/dataframe/io/tests/test_parquet.py:690: 
2021-03-10T23:01:40.1136477Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2021-03-10T23:01:40.1137077Z dask/base.py:283: in compute
2021-03-10T23:01:40.1137796Z     (result,) = compute(self, traverse=False, **kwargs)
2021-03-10T23:01:40.1138526Z dask/base.py:565: in compute
2021-03-10T23:01:40.1139190Z     results = schedule(dsk, keys, **kwargs)
2021-03-10T23:01:40.1139877Z dask/threaded.py:76: in get
2021-03-10T23:01:40.1140629Z     results = get_async(
2021-03-10T23:01:40.1141239Z dask/local.py:487: in get_async
2021-03-10T23:01:40.1141860Z     raise_exception(exc, tb)
2021-03-10T23:01:40.1142501Z dask/local.py:317: in reraise
2021-03-10T23:01:40.1143051Z     raise exc
2021-03-10T23:01:40.1143642Z dask/local.py:222: in execute_task
2021-03-10T23:01:40.1144308Z     result = _execute_task(task, data)
2021-03-10T23:01:40.1144983Z dask/core.py:121: in _execute_task
2021-03-10T23:01:40.1145690Z     return func(*(_execute_task(a, cache) for a in args))
2021-03-10T23:01:40.1146534Z dask/dataframe/io/parquet/core.py:383: in read_parquet_part
2021-03-10T23:01:40.1147202Z     dfs = [
2021-03-10T23:01:40.1147854Z dask/dataframe/io/parquet/core.py:384: in <listcomp>
2021-03-10T23:01:40.1148734Z     func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
2021-03-10T23:01:40.1149717Z dask/dataframe/io/parquet/fastparquet.py:793: in read_partition
2021-03-10T23:01:40.1150567Z     parquet_file.read_row_group_file(
2021-03-10T23:01:40.1152128Z /usr/share/miniconda3/envs/test-environment/lib/python3.8/site-packages/fastparquet/api.py:210: in read_row_group_file
2021-03-10T23:01:40.1153238Z     core.read_row_group_file(
2021-03-10T23:01:40.1154691Z /usr/share/miniconda3/envs/test-environment/lib/python3.8/site-packages/fastparquet/core.py:303: in read_row_group_file
2021-03-10T23:01:40.1155985Z     return read_row_group(f, rg, columns, categories, schema_helper, cats,
2021-03-10T23:01:40.1156776Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2021-03-10T23:01:40.1157176Z 
2021-03-10T23:01:40.1158462Z file = <_io.BufferedReader name='/tmp/pytest-of-runner/pytest-0/popen-gw2/test_append_with_partition_fas0/lon=19/part.1.parquet'>
2021-03-10T23:01:40.1160253Z rg = <class 'fastparquet.parquet_thrift.parquet.ttypes.RowGroup'>
2021-03-10T23:01:40.1162084Z columns: [<class 'fastparquet.parquet_thrift.parquet.ttyp...ompressed_size: 41
2021-03-10T23:01:40.1163245Z   total_uncompressed_size: None
2021-03-10T23:01:40.1163835Z   type: 2
2021-03-10T23:01:40.1164195Z 
2021-03-10T23:01:40.1164544Z ]
2021-03-10T23:01:40.1164906Z num_rows: 1
2021-03-10T23:01:40.1165338Z sorting_columns: None
2021-03-10T23:01:40.1165914Z total_byte_size: 117
2021-03-10T23:01:40.1166289Z 
2021-03-10T23:01:40.1167039Z columns = ['lat', 'value', 'lon', 'index'], categories = {}
2021-03-10T23:01:40.1167849Z schema_helper = <Parquet Schema with 4 entries>
2021-03-10T23:01:40.1168909Z cats = OrderedDict([('lon', [18])]), selfmade = True, index = ['index']
2021-03-10T23:01:40.1170080Z assign = {'index': array([9]), 'lat': array([19]), 'lon': array([0], dtype=int8), 'lon-catdef': [18]
2021-03-10T23:01:40.1170896Z Categories (1, int64): [18], ...}
2021-03-10T23:01:40.1171705Z scheme = 'hive', partition_meta = {}
2021-03-10T23:01:40.1172188Z 
2021-03-10T23:01:40.1172845Z     def read_row_group(file, rg, columns, categories, schema_helper, cats,
2021-03-10T23:01:40.1173745Z                        selfmade=False, index=None, assign=None,
2021-03-10T23:01:40.1174724Z                        scheme='hive', partition_meta=None):
2021-03-10T23:01:40.1175318Z         """
2021-03-10T23:01:40.1176260Z         Access row-group in a file and read some columns into a data-frame.
2021-03-10T23:01:40.1176967Z         """
2021-03-10T23:01:40.1177568Z         partition_meta = partition_meta or {}
2021-03-10T23:01:40.1178218Z         if assign is None:
2021-03-10T23:01:40.1179157Z             raise RuntimeError('Going with pre-allocation!')
2021-03-10T23:01:40.1180147Z         read_row_group_arrays(file, rg, columns, categories, schema_helper,
2021-03-10T23:01:40.1185324Z                               cats, selfmade, assign=assign)
2021-03-10T23:01:40.1185817Z     
2021-03-10T23:01:40.1186187Z         for cat in cats:
2021-03-10T23:01:40.1186849Z             if scheme == 'hive':
2021-03-10T23:01:40.1187479Z                 s = ex_from_sep('/')
2021-03-10T23:01:40.1188087Z                 partitions = s.findall(rg.columns[0].file_path)
2021-03-10T23:01:40.1188803Z             else:
2021-03-10T23:01:40.1189549Z                 partitions = [('dir%i' % i, v) for (i, v) in enumerate(
2021-03-10T23:01:40.1190424Z                     rg.columns[0].file_path.split('/')[:-1])]
2021-03-10T23:01:40.1191075Z             key, val = [p for p in partitions if p[0] == cat][0]
2021-03-10T23:01:40.1191784Z             val = val_to_num(val, meta=partition_meta.get(key))
2021-03-10T23:01:40.1192419Z >           assign[cat][:] = cats[cat].index(val)
2021-03-10T23:01:40.1192969Z E           ValueError: 19 is not in list
2021-03-10T23:01:40.1193388Z 
2021-03-10T23:01:40.1218772Z /usr/share/miniconda3/envs/test-environment/lib/python3.8/site-packages/fastparquet/core.py:366: ValueError

Metadata

Metadata

Assignees

No one assigned

    Labels

    tests — Unit tests and/or continuous integration

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions