Skip to content

Error when reading airline dataset #1128

@thenomemac

Description

@thenomemac

I'm trying to load the airline dataset (http://stat-computing.org/dataexpo/2009/the-data.html). Most years read in fine with `dd.read_csv()`; only years 2001 and 2002 fail, with the following error:

---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-78-de36e91a22ed> in <module>()
      1 #note won't work with ~/
----> 2 df = dd.read_csv('/home/thenome/airline/2002.csv.bz2', header=0, names=cols, dtype=dtypes)

/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in read_csv(fn, **kwargs)
    213     kwargs = kwargs.copy()
    214 
--> 215     head, kwargs = _fill_kwargs(fn, **kwargs)
    216 
    217     # Handle glob strings

/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in _fill_kwargs(fn, **kwargs)
    175     kwargs = _clean_kwargs(kwargs)
    176     try:
--> 177         head = pd.read_csv(fn, **assoc(kwargs, 'nrows', sample_nrows))
    178     except StopIteration:
    179         head = pd.read_csv(fn, **kwargs)

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    527                     skip_blank_lines=skip_blank_lines)
    528 
--> 529         return _read(filepath_or_buffer, kwds)
    530 
    531     parser_f.__name__ = name

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    299                                   " together yet.")
    300     elif nrows is not None:
--> 301         return parser.read(nrows)
    302     elif chunksize or iterator:
    303         return parser

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
    761                 raise ValueError('skip_footer not supported for iteration')
    762 
--> 763         ret = self._engine.read(nrows)
    764 
    765         if self.options.get('as_recarray'):

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
   1211     def read(self, nrows=None):
   1212         try:
-> 1213             data = self._reader.read(nrows)
   1214         except StopIteration:
   1215             if self._first_chunk:

pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:7988)()

pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:8444)()

pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:9261)()

pandas/parser.pyx in pandas.parser.TextReader._convert_column_data (pandas/parser.c:10654)()

pandas/parser.pyx in pandas.parser.TextReader._convert_tokens (pandas/parser.c:11333)()

pandas/parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas/parser.c:12976)()

pandas/parser.pyx in pandas.parser.TextReader._string_convert (pandas/parser.c:13222)()

pandas/parser.pyx in pandas.parser._string_box_utf8 (pandas/parser.c:18598)()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 4: invalid continuation byte

I'm at a total loss. Any suggestions?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions