Error when reading airline dataset

I'm trying to load the airline dataset [http://stat-computing.org/dataexpo/2009/the-data.html](http://www.stat.purdue.edu/~sguha/rhipe/doc/html/airline.html). Most years read in fine with `dd.read_csv()`; only years 2001 and 2002 fail, with the following error:

```
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-78-de36e91a22ed> in <module>()
      1 #note won't work with ~/
----> 2 df = dd.read_csv('/home/thenome/airline/2002.csv.bz2', header=0, names=cols, dtype=dtypes)

/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in read_csv(fn, **kwargs)
    213     kwargs = kwargs.copy()
    214 
--> 215     head, kwargs = _fill_kwargs(fn, **kwargs)
    216 
    217     # Handle glob strings

/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in _fill_kwargs(fn, **kwargs)
    175     kwargs = _clean_kwargs(kwargs)
    176     try:
--> 177         head = pd.read_csv(fn, **assoc(kwargs, 'nrows', sample_nrows))
    178     except StopIteration:
    179         head = pd.read_csv(fn, **kwargs)

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    527                     skip_blank_lines=skip_blank_lines)
    528 
--> 529         return _read(filepath_or_buffer, kwds)
    530 
    531     parser_f.__name__ = name

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    299                                   " together yet.")
    300     elif nrows is not None:
--> 301         return parser.read(nrows)
    302     elif chunksize or iterator:
    303         return parser

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
    761                 raise ValueError('skip_footer not supported for iteration')
    762 
--> 763         ret = self._engine.read(nrows)
    764 
    765         if self.options.get('as_recarray'):

/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
   1211     def read(self, nrows=None):
   1212         try:
-> 1213             data = self._reader.read(nrows)
   1214         except StopIteration:
   1215             if self._first_chunk:

pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:7988)()

pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:8444)()

pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:9261)()

pandas/parser.pyx in pandas.parser.TextReader._convert_column_data (pandas/parser.c:10654)()

pandas/parser.pyx in pandas.parser.TextReader._convert_tokens (pandas/parser.c:11333)()

pandas/parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas/parser.c:12976)()

pandas/parser.pyx in pandas.parser.TextReader._string_convert (pandas/parser.c:13222)()

pandas/parser.pyx in pandas.parser._string_box_utf8 (pandas/parser.c:18598)()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 4: invalid continuation byte

```

I'm at a total loss. Any suggestions?


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Error when reading airline dataset #1128

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Uh oh!

Error when reading airline dataset #1128

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions