Skip to content

read_hdf fails in dask even though it works in pandas #1174

@davclark

Description

@davclark

dask version 0.9.0, pandas 0.18.1 (most recent from conda as of posting)

Grab a 1 MB fake TAQ file from here. (Aside: the same data is ~1/4 MB in zipped fixed-width format — the chunk sizes are probably suboptimal for this data.)

pd.read_hdf('small_test_data_public.h5', '/IXQAJE/no_suffix') works, but dask.dataframe.read_hdf('small_test_data_public.h5', '/IXQAJE/no_suffix') fails with the stack trace below. I think the failure may be caused by the attempt to read an empty, zero-length DataFrame (dask calls pd.read_hdf with stop=0 to obtain column metadata). If I read the intent correctly, it would make more sense to open a PyTables or h5py object directly, which would provide the desired metadata without relying on a zero-length read.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-110-8ad7ff0d4733> in <module>()
----> 1 spy_dd = dd.read_hdf(fname, max_sym)

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/dask/dataframe/io.py in read_hdf(pattern, key, start, stop, columns, chunksize, lock)
    559                                     columns=columns, chunksize=chunksize,
    560                                     lock=lock)
--> 561                    for path in paths])
    562 
    563 

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/dask/dataframe/io.py in <listcomp>(.0)
    559                                     columns=columns, chunksize=chunksize,
    560                                     lock=lock)
--> 561                    for path in paths])
    562 
    563 

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/dask/dataframe/io.py in _read_single_hdf(path, key, start, stop, columns, chunksize, lock)
    499     from .multi import concat
    500     return concat([one_path_one_key(path, k, start, s, columns, chunksize, lock)
--> 501                    for k, s in zip(keys, stops)])
    502 
    503 

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/dask/dataframe/io.py in <listcomp>(.0)
    499     from .multi import concat
    500     return concat([one_path_one_key(path, k, start, s, columns, chunksize, lock)
--> 501                    for k, s in zip(keys, stops)])
    502 
    503 

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/dask/dataframe/io.py in one_path_one_key(path, key, start, stop, columns, chunksize, lock)
    474         not contain any wildcards).
    475         """
--> 476         empty = pd.read_hdf(path, key, stop=0)
    477         if columns is not None:
    478             empty = empty[columns]

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/io/pytables.py in read_hdf(path_or_buf, key, **kwargs)
    328                                  'multiple datasets.')
    329             key = keys[0]
--> 330         return store.select(key, auto_close=auto_close, **kwargs)
    331     except:
    332         # if there is an error, close the store

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/io/pytables.py in select(self, key, where, start, stop, columns, iterator, chunksize, auto_close, **kwargs)
    678                            chunksize=chunksize, auto_close=auto_close)
    679 
--> 680         return it.get_result()
    681 
    682     def select_as_coordinates(

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/io/pytables.py in get_result(self, coordinates)
   1362 
   1363         # directly return the result
-> 1364         results = self.func(self.start, self.stop, where)
   1365         self.close()
   1366         return results

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/io/pytables.py in func(_start, _stop, _where)
    671             return s.read(start=_start, stop=_stop,
    672                           where=_where,
--> 673                           columns=columns, **kwargs)
    674 
    675         # create the iterator

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/io/pytables.py in read(self, where, columns, **kwargs)
   4052 
   4053             block = make_block(values, placement=np.arange(len(cols_)))
-> 4054             mgr = BlockManager([block], [cols_, index_])
   4055             frames.append(DataFrame(mgr))
   4056 

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
   2592 
   2593         if do_integrity_check:
-> 2594             self._verify_integrity()
   2595 
   2596         self._consolidate_check()

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/core/internals.py in _verify_integrity(self)
   2802         for block in self.blocks:
   2803             if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
-> 2804                 construction_error(tot_items, block.shape[1:], self.axes)
   2805         if len(self.items) != tot_items:
   2806             raise AssertionError('Number of manager items must equal union of '

/home/dav/miniconda3/envs/TAQ/lib/python3.5/site-packages/pandas/core/internals.py in construction_error(tot_items, block_shape, axes, e)
   3966         raise e
   3967     if block_shape[0] == 0:
-> 3968         raise ValueError("Empty data passed with indices specified.")
   3969     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
   3970         passed, implied))

ValueError: Empty data passed with indices specified.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions