BUG: read_csv with engine=pyarrow and numpy-nullable dtype #62053

Open · wants to merge 5 commits into base: main

1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -814,6 +814,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of ``na_values`` causes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
+- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision. (:issue:`56136`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
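A minimal repro sketch of the whatsnew entry above (assuming a pandas build with this branch and pyarrow installed; the column name and value are illustrative):

```python
from io import StringIO

import pandas as pd

# 2**53 + 1 is not exactly representable as float64, so any float round-trip
# silently rounds it down to 9007199254740992.
data = "a\n9007199254740993\n"
df = pd.read_csv(StringIO(data), dtype="Int64", engine="pyarrow")
print(df["a"].iloc[0])  # expected 9007199254740993 with the fix applied
```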
98 changes: 82 additions & 16 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -3,6 +3,10 @@
 from typing import TYPE_CHECKING
 import warnings

+import numpy as np
+
+from pandas._config import using_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
@@ -11,9 +15,17 @@
 )
 from pandas.util._exceptions import find_stack_level

-from pandas.core.dtypes.common import pandas_dtype
+from pandas.core.dtypes.common import (
+    is_string_dtype,
+    pandas_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    BaseMaskedDtype,
+)
 from pandas.core.dtypes.inference import is_integer

+from pandas.core.arrays.string_ import StringDtype
+
 from pandas.io._util import arrow_table_to_pandas
 from pandas.io.parsers.base_parser import ParserBase

@@ -140,20 +152,7 @@ def handle_warning(invalid_row) -> str:
             "encoding": self.encoding,
         }

-    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
-        """
-        Processes data read in based on kwargs.
-
-        Parameters
-        ----------
-        frame: DataFrame
-            The DataFrame to process.
-
-        Returns
-        -------
-        DataFrame
-            The processed DataFrame.
-        """
+    def _finalize_column_names(self, frame: DataFrame) -> DataFrame:
         num_cols = len(frame.columns)
         multi_index_named = True
         if self.header is None:
@@ -196,6 +195,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
         if self.header is None and not multi_index_named:
             frame.index.names = [None] * len(frame.index.names)

+        return frame
+
+    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
+        """
+        Processes data read in based on kwargs.
+
+        Parameters
+        ----------
+        frame: DataFrame
+            The DataFrame to process.
+
+        Returns
+        -------
+        DataFrame
+            The processed DataFrame.
+        """
+
         if self.dtype is not None:
             # Ignore non-existent columns from dtype mapping
             # like other parsers do
@@ -282,14 +298,64 @@ def read(self) -> DataFrame:

         table = table.cast(new_schema)

+        workaround = False
+        pass_backend = dtype_backend
+        if self.dtype is not None and dtype_backend != "pyarrow":
+            # We pass dtype_backend="numpy_nullable" and subsequently cast
+            # to avoid lossy conversion e.g. GH#56136
+            workaround = True
+            pass_backend = "numpy_nullable"
+
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore",
                 "make_block is deprecated",
                 DeprecationWarning,
             )
             frame = arrow_table_to_pandas(
-                table, dtype_backend=dtype_backend, null_to_int64=True
+                table, dtype_backend=pass_backend, null_to_int64=True
             )

+        frame = self._finalize_column_names(frame)
+
+        if workaround and dtype_backend != "numpy_nullable":
+            old_dtype = self.dtype
+            if not isinstance(old_dtype, dict):
+                # e.g. test_categorical_dtype_utf16
+                old_dtype = dict.fromkeys(frame.columns, old_dtype)
+
+            # _finalize_pandas_output will call astype, but we need to make
+            # sure all keys are populated appropriately.
+            new_dtype = {}
+            for key in frame.columns:
+                ser = frame[key]
+                if isinstance(ser.dtype, BaseMaskedDtype):
+                    new_dtype[key] = ser.dtype.numpy_dtype
+                    if (
+                        key in old_dtype
+                        and not using_string_dtype()
+                        and is_string_dtype(old_dtype[key])
+                        and not isinstance(old_dtype[key], StringDtype)
+                        and ser.array._hasna
+                    ):
+                        # Cast to make sure we get "NaN" string instead of "NA"
+                        frame[key] = ser.astype(old_dtype[key])
+                        frame.loc[ser.isna(), key] = np.nan
+                        old_dtype[key] = object  # Avoid re-casting
+                elif isinstance(ser.dtype, StringDtype):
+                    # We cast here in case the user passed "category" in
+                    # order to get the correct dtype.categories.dtype
+                    # e.g. test_categorical_dtype_utf16
+                    if not using_string_dtype():
+                        sdt = np.dtype(object)
+                        frame[key] = ser.astype(sdt)
+                        frame.loc[ser.isna(), key] = np.nan
+                    else:
+                        sdt = StringDtype(na_value=np.nan)  # type: ignore[assignment]
+                        frame[key] = frame[key].astype(sdt)
+                    new_dtype[key] = sdt
+
+            new_dtype.update(old_dtype)
+            self.dtype = new_dtype
+
         return self._finalize_pandas_output(frame)
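The workaround above is easiest to see outside the parser. Below is a standalone sketch (not the pandas internals; the table contents are illustrative) of the same two-step conversion: materialize the Arrow table with nullable masked dtypes first, so integers never round-trip through float64, then apply the requested dtype, resetting NA positions after a string cast:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

table = pa.table({"a": pa.array([9007199254740993, None], type=pa.int64())})

# Step 1: convert to a masked Int64 column, which keeps the exact integer
# value plus a separate NA mask (no lossy float64 intermediate).
frame = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
print(frame["a"].dtype)    # Int64
print(frame["a"].iloc[0])  # 9007199254740993

# Step 2: apply a requested string dtype afterwards; NA positions are reset
# so the result holds NaN rather than the literal string "<NA>".
as_str = frame["a"].astype(str)
as_str[frame["a"].isna()] = np.nan
print(as_str.tolist())     # ['9007199254740993', nan]
```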
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
     tm.assert_frame_equal(result, expected)


-# pyarrow engine failing:
-# https://github.com/pandas-dev/pandas/issues/56136
-@pytest.mark.usefixtures("pyarrow_xfail")
 def test_ea_int_avoid_overflow(all_parsers):
     # GH#32134
     parser = all_parsers
@@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     tm.assert_frame_equal(result, expected)


-@xfail_pyarrow
 def test_accurate_parsing_of_large_integers(all_parsers):
     # GH#52505
     data = """SYMBOL,MOMENT,ID,ID_DEAL
17 changes: 14 additions & 3 deletions pandas/tests/io/parser/test_na_values.py
@@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers):
     tm.assert_frame_equal(out, expected)


-@xfail_pyarrow  # mismatched shape
 @pytest.mark.parametrize("na_filter", [True, False])
-def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
+def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request):
     # see gh-20377
     parser = all_parsers
+    if parser.engine == "pyarrow" and na_filter is False:
+        mark = pytest.mark.xfail(reason="mismatched shape")
+        request.applymarker(mark)
+
     data = "a,b,c\n1,,3\n4,5,6"

     # na_filter=True --> missing value becomes NaN.
@@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers):
 True
 False
 """
-    with pytest.raises(ValueError, match="convert|NoneType"):
+    msg = (
+        "cannot safely convert passed user dtype of int(64|32) for "
+        "<class 'numpy.bool_?'> dtyped data in column 0 due to NA values"
+    )
+    if parser.engine == "python":
+        msg = "Unable to convert column 0 to type int(64|32)"
+    elif parser.engine == "pyarrow":
+        msg = r"cannot convert NA to integer"
+    with pytest.raises(ValueError, match=msg):
         parser.read_csv(StringIO(data), dtype="int")
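For context, a hedged sketch of the behavior these expectations pin down (the CSV here is illustrative, not the test's exact fixture): a boolean column containing a missing value cannot be safely cast to int, so every engine raises ValueError, just with different wording, which is why the test now matches a per-engine `msg`:

```python
from io import StringIO

import pandas as pd

data = "a,b\nTrue,1\n,2\n"  # the empty field in column "a" parses as NA
try:
    pd.read_csv(StringIO(data), dtype={"a": "int"})  # default C engine
except ValueError as err:
    print(err)  # e.g. "cannot safely convert passed user dtype of int64 ..."
```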

