-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
WIP: Fix precision of DataFrame.combine and DataFrame.combine_first #62691
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c8ba339
f786763
ae99b3d
6e77fc9
ea7d2ba
301d85f
a6b461c
e15bde9
451621b
4fdc459
ef662a0
fefadcb
f80917d
bf69fad
016c64e
2928cee
1a53d48
7e6837f
444deaa
49ff1a5
747e8bc
c527bc0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -144,6 +144,10 @@ | |
| PeriodArray, | ||
| TimedeltaArray, | ||
| ) | ||
| from pandas.core.arrays.integer import ( | ||
| Int64Dtype, | ||
| UInt64Dtype, | ||
| ) | ||
| from pandas.core.arrays.sparse import SparseFrameAccessor | ||
| from pandas.core.arrays.string_ import StringDtype | ||
| from pandas.core.construction import ( | ||
|
|
@@ -9025,6 +9029,67 @@ def combine( | |
| 1 0.0 3.0 1.0 | ||
| 2 NaN 3.0 1.0 | ||
| """ | ||
|
|
||
| # GH#60128 Integers n where |n| > 2**53 would lose precision after align | ||
| # upcasts them to float. Avoid lossy conversion by preemptively promoting | ||
| # int64 and uint64 to their nullable ExtensionDtypes, Int64 and UInt64. | ||
| def _ensure_nullable_int64_dtypes(df: DataFrame) -> DataFrame: | ||
| """Promote int64/uint64 DataFrame columns to Int64/UInt64.""" | ||
| cast_map: dict[IndexLabel, DtypeObj] = {} | ||
| for col, dt in df.dtypes.items(): | ||
| if dt == np.int64: | ||
| cast_map[col] = Int64Dtype() | ||
| elif dt == np.uint64: | ||
| cast_map[col] = UInt64Dtype() | ||
|
|
||
| if cast_map: | ||
| df = df.astype(cast_map) | ||
| return df | ||
|
|
||
| # To maintain backwards compatibility, downcast the pre-promoted int64 | ||
| # columns of the combined DataFrame back to how they would have resolved. | ||
| # Consider just embracing nullable ExtensionDtypes instead, though. | ||
| def _revert_int64_dtype_promotion( | ||
| self_orig: DataFrame, other_orig: DataFrame, combined_df: DataFrame | ||
| ) -> DataFrame: | ||
| """Resolve the combined dtypes according to the original dtypes.""" | ||
| cast_map: dict[IndexLabel, DtypeObj] = {} | ||
| for col in combined_df.columns: | ||
| ser = combined_df[col] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is not going to be robust to non-unique columns. can you use |
||
| orig_dt_self = self_orig.dtypes.get(col) | ||
| orig_dt_other = other_orig.dtypes.get(col) | ||
|
|
||
| was_promoted = (orig_dt_self in [np.int64, np.uint64]) or ( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no risk for smaller int widths? |
||
| orig_dt_other in [np.int64, np.uint64] | ||
| ) | ||
|
|
||
| if was_promoted: | ||
| dtypes_to_resolve = [ | ||
| dt for dt in (orig_dt_self, orig_dt_other) if dt is not None | ||
| ] | ||
| if dtypes_to_resolve: | ||
| if isna(ser).any(): | ||
| # If there are NAs, we can't safely downcast back | ||
| # to int. Previously, we left the data as float64. | ||
| # However, converting large integers to float can | ||
| # lose precision, even if it's not immediately | ||
| # obvious (since we don't cast back). Consider | ||
| # embracing nullable ExtensionDtypes instead | ||
| # and dropping this whole restoration step. | ||
| dtypes_to_resolve.append(np.dtype(np.float64)) | ||
| target_type = find_common_type(dtypes_to_resolve) | ||
| cast_map[col] = target_type | ||
|
|
||
| if cast_map: | ||
| combined_df = combined_df.astype(cast_map) | ||
| return combined_df | ||
|
|
||
| # store originals and prepare for align | ||
| self_orig = self | ||
| other_orig = other | ||
| self = _ensure_nullable_int64_dtypes(self) | ||
| other = _ensure_nullable_int64_dtypes(other) | ||
|
|
||
| other_idxlen = len(other.index) # save for compare | ||
| other_columns = other.columns | ||
|
|
||
|
|
@@ -9092,6 +9157,9 @@ def combine( | |
|
|
||
| # convert_objects just in case | ||
| frame_result = self._constructor(result, index=new_index, columns=new_columns) | ||
| frame_result = _revert_int64_dtype_promotion( | ||
| self_orig, other_orig, frame_result | ||
| ) | ||
| return frame_result.__finalize__(self, method="combine") | ||
|
|
||
| def combine_first(self, other: DataFrame) -> DataFrame: | ||
|
|
@@ -9141,20 +9209,10 @@ def combine_first(self, other: DataFrame) -> DataFrame: | |
| 1 0.0 3.0 1.0 | ||
| 2 NaN 3.0 1.0 | ||
| """ | ||
| from pandas.core.computation import expressions | ||
|
|
||
| def combiner(x: Series, y: Series): | ||
| mask = x.isna()._values | ||
|
|
||
| x_values = x._values | ||
| y_values = y._values | ||
|
|
||
| # If the column y in other DataFrame is not in first DataFrame, | ||
| # just return y_values. | ||
| if y.name not in self.columns: | ||
| return y_values | ||
|
|
||
| return expressions.where(mask, y_values, x_values) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does not using expressions.where have a perf impact? |
||
| # GH#60128 The combiner is supposed to preserve EA Dtypes. | ||
| return y if y.name not in self.columns else y.where(x.isna(), x) | ||
|
|
||
| if len(other) == 0: | ||
| combined = self.reindex( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -398,6 +398,21 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): | |
| ).set_index(["a", "b"]) | ||
| tm.assert_frame_equal(result, expected) | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "wide_val, dtype", | ||
| ( | ||
| (1666880195890293744, "uint64"), | ||
| (-1666880195890293744, "int64"), | ||
| ), | ||
| ) | ||
| def test_combine_first_preserve_precision(self, wide_val, dtype): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looks like you made changes to both combine and combine_first, but only a test for combine_first. can you test the fixed bug in combine? |
||
| # GH#60128 | ||
| df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype) | ||
| df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype) | ||
| result = df1.combine_first(df2) | ||
| expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype) | ||
| tm.assert_frame_equal(result, expected) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "scalar1, scalar2", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
align only upcasts to float when the frames have different indexes right? would it be simpler to just warn users and tell them to do alignment before calling .combine?