-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Closed
Description
I'm trying to load the airline dataset (http://stat-computing.org/dataexpo/2009/the-data.html). Most years read in fine with `dd.read_csv()`. However, years 2001 and 2002 fail with the following error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-78-de36e91a22ed> in <module>()
1 #note won't work with ~/
----> 2 df = dd.read_csv('/home/thenome/airline/2002.csv.bz2', header=0, names=cols, dtype=dtypes)
/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in read_csv(fn, **kwargs)
213 kwargs = kwargs.copy()
214
--> 215 head, kwargs = _fill_kwargs(fn, **kwargs)
216
217 # Handle glob strings
/home/thenome/anaconda3/lib/python3.5/site-packages/dask/dataframe/io.py in _fill_kwargs(fn, **kwargs)
175 kwargs = _clean_kwargs(kwargs)
176 try:
--> 177 head = pd.read_csv(fn, **assoc(kwargs, 'nrows', sample_nrows))
178 except StopIteration:
179 head = pd.read_csv(fn, **kwargs)
/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
527 skip_blank_lines=skip_blank_lines)
528
--> 529 return _read(filepath_or_buffer, kwds)
530
531 parser_f.__name__ = name
/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
299 " together yet.")
300 elif nrows is not None:
--> 301 return parser.read(nrows)
302 elif chunksize or iterator:
303 return parser
/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
761 raise ValueError('skip_footer not supported for iteration')
762
--> 763 ret = self._engine.read(nrows)
764
765 if self.options.get('as_recarray'):
/home/thenome/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
1211 def read(self, nrows=None):
1212 try:
-> 1213 data = self._reader.read(nrows)
1214 except StopIteration:
1215 if self._first_chunk:
pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:7988)()
pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:8444)()
pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:9261)()
pandas/parser.pyx in pandas.parser.TextReader._convert_column_data (pandas/parser.c:10654)()
pandas/parser.pyx in pandas.parser.TextReader._convert_tokens (pandas/parser.c:11333)()
pandas/parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas/parser.c:12976)()
pandas/parser.pyx in pandas.parser.TextReader._string_convert (pandas/parser.c:13222)()
pandas/parser.pyx in pandas.parser._string_box_utf8 (pandas/parser.c:18598)()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 4: invalid continuation byte
I'm at a total loss. Any suggestions?
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels