Skip to content

Error when passing pd.NA value in a data column with pandas nullable int dtype #2844

@weiji14

Description

@weiji14

Description of the problem

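Running pygmt.info on a pandas.Series that contains a pd.NA value (i.e. one that uses pandas' nullable integer datatype) raises GMTInvalidInput: Unsupported numpy data type '<class 'numpy.object_'>', which is chained from an underlying ValueError: Converting an integer to a NumPy datetime requires a specified unit. The series reaches _check_dtype_and_dim as an object-dtype array, which is not in DTYPES, so the function falls back to array_to_datetime and that conversion fails. The relevant code is the _check_dtype_and_dim function here: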

pygmt/pygmt/clib/session.py

Lines 789 to 844 in 7aac7fd

def _check_dtype_and_dim(self, array, ndim):
"""
Check that a numpy array has the given number of dimensions and is a
valid data type.
Parameters
----------
array : numpy.ndarray
The array to be tested.
ndim : int
The desired number of array dimensions.
Returns
-------
gmt_type : int
The GMT constant value representing this data type.
Raises
------
GMTInvalidInput
If the array has the wrong number of dimensions or
is an unsupported data type.
Examples
--------
>>> import numpy as np
>>> data = np.array([1, 2, 3], dtype="float64")
>>> with Session() as ses:
... gmttype = ses._check_dtype_and_dim(data, ndim=1)
... gmttype == ses["GMT_DOUBLE"]
...
True
>>> data = np.ones((5, 2), dtype="float32")
>>> with Session() as ses:
... gmttype = ses._check_dtype_and_dim(data, ndim=2)
... gmttype == ses["GMT_FLOAT"]
...
True
"""
# Check that the array has the given number of dimensions
if array.ndim != ndim:
raise GMTInvalidInput(
f"Expected a numpy {ndim}-D array, got {array.ndim}-D."
)
# Check that the array has a valid/known data type
if array.dtype.type not in DTYPES:
try:
# Try to convert any unknown numpy data types to np.datetime64
array = array_to_datetime(array)
except ValueError as e:
raise GMTInvalidInput(
f"Unsupported numpy data type '{array.dtype.type}'."
) from e
return self[DTYPES[array.dtype.type]]

Maybe we need to update the DTYPES dictionary here:

DTYPES = {
np.int8: "GMT_CHAR",
np.int16: "GMT_SHORT",
np.int32: "GMT_INT",
np.int64: "GMT_LONG",
np.uint8: "GMT_UCHAR",
np.uint16: "GMT_USHORT",
np.uint32: "GMT_UINT",
np.uint64: "GMT_ULONG",
np.float32: "GMT_FLOAT",
np.float64: "GMT_DOUBLE",
np.str_: "GMT_TEXT",
np.datetime64: "GMT_DATETIME",
}

to include pd.Int32Dtype() and pd.Int64Dtype()? But what would we map these nullable dtypes to in GMT?

Minimal Complete Verifiable Example

import pandas as pd
import pygmt
    
series = pd.Series(data=[0, 4, pd.NA, 8, 6], dtype=pd.Int32Dtype())
output = pygmt.info(data=series)

Full error message

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~/Documents/pygmt/pygmt/clib/session.py:839, in Session._check_dtype_and_dim(self, array, ndim)
    837 try:
    838     # Try to convert any unknown numpy data types to np.datetime64
--> 839     array = array_to_datetime(array)
    840 except ValueError as e:

File ~/Documents/pygmt/pygmt/clib/conversion.py:327, in array_to_datetime(array)
    253 """
    254 Convert a 1-D datetime array from various types into numpy.datetime64.
    255 
   (...)
    325        '2018-01-01T00:00:00.000000'], dtype='datetime64[us]')
    326 """
--> 327 return np.asarray(array, dtype=np.datetime64)

ValueError: Converting an integer to a NumPy datetime requires a specified unit

The above exception was the direct cause of the following exception:

GMTInvalidInput                           Traceback (most recent call last)
Cell In[4], line 1
----> 1 output = pygmt.info(data=series)

File ~/Documents/pygmt/pygmt/helpers/decorators.py:600, in use_alias.<locals>.alias_decorator.<locals>.new_module(*args, **kwargs)
    593     msg = (
    594         "Parameters 'Y' and 'yshift' are deprecated since v0.8.0. "
    595         "and will be removed in v0.12.0. "
    596         "Use Figure.shift_origin(yshift=...) instead."
    597     )
    598     warnings.warn(msg, category=SyntaxWarning, stacklevel=2)
--> 600 return module_func(*args, **kwargs)

File ~/Documents/pygmt/pygmt/helpers/decorators.py:740, in kwargs_to_strings.<locals>.converter.<locals>.new_module(*args, **kwargs)
    738             kwargs[arg] = separators[fmt].join(f"{item}" for item in value)
    739 # Execute the original function and return its output
--> 740 return module_func(*args, **kwargs)

File ~/Documents/pygmt/pygmt/src/info.py:85, in info(data, **kwargs)
     83 file_context = lib.virtualfile_from_data(check_kind="vector", data=data)
     84 with GMTTempFile() as tmpfile:
---> 85     with file_context as fname:
     86         lib.call_module(
     87             module="info",
     88             args=build_arg_string(kwargs, infile=fname, outfile=tmpfile.name),
     89         )
     90     result = tmpfile.read()

File ~/mambaforge/envs/pygmt/lib/python3.12/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
    135 del self.args, self.kwds, self.func
    136 try:
--> 137     return next(self.gen)
    138 except StopIteration:
    139     raise RuntimeError("generator didn't yield") from None

File ~/Documents/pygmt/pygmt/clib/session.py:1276, in Session.virtualfile_from_vectors(self, *vectors)
   1274 # Use put_vector for columns with numerical type data
   1275 for col, array in enumerate(arrays[:columns]):
-> 1276     self.put_vector(dataset, column=col, vector=array)
   1278 # Use put_strings for last column(s) with string type data
   1279 # Have to use modifier "GMT_IS_DUPLICATE" to duplicate the strings
   1280 string_arrays = arrays[columns:]

File ~/Documents/pygmt/pygmt/clib/session.py:888, in Session.put_vector(self, dataset, column, vector)
    847 r"""
    848 Attach a numpy 1-D array as a column on a GMT dataset.
    849 
   (...)
    880     status != 0.
    881 """
    882 c_put_vector = self.get_libgmt_func(
    883     "GMT_Put_Vector",
    884     argtypes=[ctp.c_void_p, ctp.c_void_p, ctp.c_uint, ctp.c_uint, ctp.c_void_p],
    885     restype=ctp.c_int,
    886 )
--> 888 gmt_type = self._check_dtype_and_dim(vector, ndim=1)
    889 if gmt_type in (self["GMT_TEXT"], self["GMT_DATETIME"]):
    890     vector_pointer = (ctp.c_char_p * len(vector))()

File ~/Documents/pygmt/pygmt/clib/session.py:841, in Session._check_dtype_and_dim(self, array, ndim)
    839         array = array_to_datetime(array)
    840     except ValueError as e:
--> 841         raise GMTInvalidInput(
    842             f"Unsupported numpy data type '{array.dtype.type}'."
    843         ) from e
    844 return self[DTYPES[array.dtype.type]]

GMTInvalidInput: Unsupported numpy data type '<class 'numpy.object_'>'.

System information

PyGMT information:
  version: v0.10.1.dev114+g88ce36d61.d20231203
System information:
  python: 3.12.0 | packaged by conda-forge | (main, Oct  3 2023, 08:43:22) [GCC 12.3.0]
  executable: /home/user/mambaforge/envs/pygmt/bin/python
  machine: Linux-6.5.0-4-amd64-x86_64-with-glibc2.37
Dependency information:
  numpy: 1.26.2
  pandas: 2.1.3
  xarray: 2023.11.0
  netCDF4: 1.6.5
  packaging: 23.2
  contextily: 1.4.0
  geopandas: 0.14.1
  ipython: None
  rioxarray: 0.15.0
  ghostscript: 9.54.0
GMT library information:
  binary version: 6.4.0
  cores: 16
  grid layout: rows
  image layout: 
  library path: /home/user/mambaforge/envs/pygmt/lib/libgmt.so
  padding: 2
  plugin dir: /home/user/mambaforge/envs/pygmt/lib/gmt/plugins
  share dir: /home/user/mambaforge/envs/pygmt/share/gmt
  version: 6.4.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions