diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5ffda03fad80f..6714afe59143e 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -358,6 +358,26 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class FillNA: + def setup(self): + N = 100 + self.df = DataFrame( + {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N} + ).set_index("group") + + def time_df_ffill(self): + self.df.groupby("group").fillna(method="ffill") + + def time_df_bfill(self): + self.df.groupby("group").fillna(method="bfill") + + def time_srs_ffill(self): + self.df.groupby("group")["value"].fillna(method="ffill") + + def time_srs_bfill(self): + self.df.groupby("group")["value"].fillna(method="bfill") + + class GroupByMethods: param_names = ["dtype", "method", "application"] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 113ad3e338952..b1091ea7f60e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,3 +26,28 @@ jobs: parameters: name: Windows vmImage: vs2017-win2016 + +- job: py37_32bit + pool: + vmImage: ubuntu-18.04 + + steps: + - script: | + docker pull quay.io/pypa/manylinux2014_i686 + docker run -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ + /bin/bash -xc "cd pandas && \ + /opt/python/cp37-cp37m/bin/python -m venv ~/virtualenvs/pandas-dev && \ + . ~/virtualenvs/pandas-dev/bin/activate && \ + python -m pip install --no-deps -U pip wheel setuptools && \ + pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ + python setup.py build_ext -q -i -j2 && \ + python -m pip install --no-build-isolation -e . && \ + pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" + displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testResultsFiles: '**/test-*.xml' + failTaskOnFailedTests: true + testRunTitle: 'Publish test results for Python 3.7-32 bit full Linux' diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 33c0750c1dc16..b8abc71ca64a2 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.4 v1.1.3 v1.1.2 v1.1.1 diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 2323afbe00e5d..e752eb54d0c15 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -75,4 +75,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.2..v1.1.3|HEAD +.. contributors:: v1.1.2..v1.1.3 diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst new file mode 100644 index 0000000000000..fb8687b8ba42c --- /dev/null +++ b/doc/source/whatsnew/v1.1.4.rst @@ -0,0 +1,55 @@ +.. _whatsnew_114: + +What's new in 1.1.4 (October 30, 2020) +-------------------------------------- + +These are the changes in pandas 1.1.4. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_114.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`read_csv` raising a ``ValueError`` when ``names`` was of type ``dict_keys`` (:issue:`36928`)
+- Fixed regression in :func:`read_csv` with more than 1M rows and specifying an ``index_col`` argument (:issue:`37094`)
+- Fixed regression where attempting to mutate a :class:`DateOffset` object would no longer raise an ``AttributeError`` (:issue:`36940`)
+- Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`)
+- Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`)
+- Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`)
+- Fixed regression in :meth:`Series.rank` method failing for read-only data (:issue:`37290`)
+- Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
+- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raising an ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
+- Fixed regression in ``DataFrame.groupby(..).std()`` with nullable integer dtype (:issue:`37415`)
+- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
+- Fixed regression where slicing :class:`DatetimeIndex` raised :exc:`AssertionError` on irregular time series with ``pd.NaT`` or on unsorted indices (:issue:`36953` and :issue:`35509`)
+- Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
+- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
+- Fixed regression in setitem with :meth:`DataFrame.iloc` which raised an error when trying to set a value while filtering with a boolean list (:issue:`36741`)
+- Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`)
+- Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`)
+- Fixed regression in an inplace arithmetic operation on a Series not updating the parent DataFrame (:issue:`36373`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`)
+- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`)
+- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`)
+- Bug in :meth:`DataFrame.info` raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`)
+- Bug in :meth:`DataFrameGroupBy.apply` that would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. 
contributors:: v1.1.3..v1.1.4|HEAD diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0a70afda893cf..564dfc3a2ca5a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -325,7 +325,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): @cython.boundscheck(False) @cython.wraparound(False) -def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray: +def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result @@ -799,7 +799,7 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - rank_t[:] in_arr, + ndarray[rank_t, ndim=1] in_arr, ties_method="average", bint ascending=True, na_option="keep", @@ -1018,7 +1018,7 @@ def rank_1d( def rank_2d( - rank_t[:, :] in_arr, + ndarray[rank_t, ndim=2] in_arr, int axis=0, ties_method="average", bint ascending=True, diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0499eabf708af..eebf89d9650e5 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,8 +1,16 @@ -from pandas._libs.khash cimport ( - kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t, - int64_t, float64_t) from numpy cimport ndarray +from pandas._libs.khash cimport ( + float64_t, + int64_t, + kh_float64_t, + kh_int64_t, + kh_pymap_t, + kh_str_t, + kh_uint64_t, + uint64_t, +) + # prototypes for sharing cdef class HashTable: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0cc0a6b192df5..558963c38851e 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -208,7 +208,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): +def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{endif}} """ Return boolean of values in arr on an diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index b5fe73df5d9be..1bb3a158b4b1a 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,6 @@ from cpython.object cimport PyObject -from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t +from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t + cdef extern from "khash_python.h": ctypedef uint32_t khint_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6bf0aba128e39..7127c57defee3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -36,6 +36,7 @@ from numpy cimport ( float32_t, float64_t, int64_t, + intp_t, ndarray, uint8_t, uint64_t, @@ -490,7 +491,7 @@ def has_infs_f8(const float64_t[:] arr) -> bool: return False -def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) int k, vstart, vlast, v diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 090c5c5173280..e02b84381b62c 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,5 +1,6 @@ from numpy cimport ndarray, uint8_t + cpdef bint checknull(object val) cpdef bint checknull_old(object val) cpdef ndarray[uint8_t] isnaobj(ndarray arr) diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 4eb5188b8a04b..388fd0c62b937 
100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -1,6 +1,5 @@ from cython cimport Py_ssize_t - -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t ctypedef (int32_t, int32_t, int32_t) iso_calendar_t diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 73772e5ab4577..31a6862be5fbf 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,6 +1,5 @@ from cpython.datetime cimport datetime, tzinfo - -from numpy cimport int64_t, int32_t, ndarray +from numpy cimport int32_t, int64_t, ndarray from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 3f7240654d7e8..d38f4518f9bf0 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,6 +1,7 @@ from cpython.datetime cimport datetime - from numpy cimport int64_t + + cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index eebdcb3ace507..b2524c6bc6c0d 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -1,6 +1,6 @@ from cpython.datetime cimport date, datetime +from numpy cimport int32_t, int64_t -from numpy cimport int64_t, int32_t cdef extern from "numpy/ndarrayobject.h": ctypedef int64_t npy_timedelta diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 9a9244db4a565..215c3f849281f 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,5 +1,6 @@ from numpy cimport int64_t + cpdef to_offset(object obj) cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index ac2725fc58aee..54c08da5269f4 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -785,6 +785,11 @@ cdef class Tick(SingleConstructorOffset): def is_anchored(self) -> bool: return False + # This is identical to BaseOffset.__hash__, but has to be redefined here + # for Python 3, because we've redefined __eq__. + def __hash__(self) -> int: + return hash(self._params) + # -------------------------------------------------------------------- # Comparison and Arithmetic Methods @@ -1209,9 +1214,8 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ - - pass - + def __setattr__(self, name, value): + raise AttributeError("DateOffset objects are immutable.") # -------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/period.pxd b/pandas/_libs/tslibs/period.pxd index 9c0342e239a89..46c6e52cb9156 100644 --- a/pandas/_libs/tslibs/period.pxd +++ b/pandas/_libs/tslibs/period.pxd @@ -2,5 +2,6 @@ from numpy cimport int64_t from .np_datetime cimport npy_datetimestruct + cdef bint is_period_object(object obj) cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 4142861e9ad38..fed1f2d326819 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -1,6 +1,7 @@ from cpython.datetime cimport timedelta from numpy cimport int64_t + # Exposed for tslib, not intended for outside use. cpdef int64_t delta_to_nanoseconds(delta) except? 
-1
 cdef convert_to_timedelta64(object ts, str unit)
diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd
index 307b6dfc90715..755cf3fc940b8 100644
--- a/pandas/_libs/tslibs/timestamps.pxd
+++ b/pandas/_libs/tslibs/timestamps.pxd
@@ -1,5 +1,4 @@
 from cpython.datetime cimport datetime, tzinfo
-
 from numpy cimport int64_t
 
 from pandas._libs.tslibs.base cimport ABCTimestamp
diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd
index 136710003d32a..753c881ed505c 100644
--- a/pandas/_libs/tslibs/timezones.pxd
+++ b/pandas/_libs/tslibs/timezones.pxd
@@ -1,5 +1,6 @@
 from cpython.datetime cimport datetime, timedelta, tzinfo
 
+
 cdef tzinfo utc_pytz
 
 cpdef bint is_utc(tzinfo tz)
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
index e280609bb17a7..b5e89e5ed3ef4 100644
--- a/pandas/_libs/tslibs/util.pxd
+++ b/pandas/_libs/tslibs/util.pxd
@@ -1,6 +1,7 @@
 from cpython.object cimport PyTypeObject
 
+
 cdef extern from *:
     """
     PyObject* char_to_string(const char* data) {
@@ -26,7 +27,8 @@ cdef extern from "Python.h":
     const char* PyUnicode_AsUTF8AndSize(object obj,
                                         Py_ssize_t* length) except NULL
 
-from numpy cimport int64_t, float64_t
+from numpy cimport float64_t, int64_t
+
 
 cdef extern from "numpy/arrayobject.h":
     PyTypeObject PyFloatingArrType_Type
diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd
index 828bccf7d5641..7394605722103 100644
--- a/pandas/_libs/util.pxd
+++ b/pandas/_libs/util.pxd
@@ -1,8 +1,9 @@
-from pandas._libs.tslibs.util cimport *
-
 cimport numpy as cnp
 from numpy cimport ndarray
 
+from pandas._libs.tslibs.util cimport *
+
+
 cdef extern from "numpy/ndarraytypes.h":
     void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 67ab3a8548f21..48d4fe65942fe 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -440,7 +440,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     if len(comps) > 1_000_000 and not is_object_dtype(comps):
         # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
-        if np.isnan(values).any():
+        if isna(values).any():
             f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
         else:
             f = np.in1d
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index a87bddef481b5..bdf294a380edc 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -922,7 +922,9 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
     dtype = pandas_dtype(dtype)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)
+        return lib.ensure_string_array(
+            arr.ravel(), skipna=skipna, convert_na_value=False
+        ).reshape(arr.shape)
 
     elif is_datetime64_dtype(arr):
         if is_object_dtype(dtype):
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 8350e136417b1..404f0b42f0e33 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -894,6 +894,9 @@ def __eq__(self, other: Any) -> bool:
 
         return isinstance(other, PeriodDtype) and self.freq == other.freq
 
+    def __ne__(self, other: Any) -> bool:
+        return not self.__eq__(other)
+
     def __setstate__(self, state):
         # for pickle compat. 
__getstate__ is defined in the # PandasExtensionDtype superclass and uses the public properties to diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0cbcb0ce3d700..0b2c99ea674c2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2880,6 +2880,10 @@ def __getitem__(self, key): # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: + if isinstance(indexer, np.ndarray): + indexer = lib.maybe_indices_to_slice( + indexer.astype(np.intp, copy=False), len(self) + ) # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._slice(indexer, axis=0) @@ -7359,7 +7363,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): result = None try: - result, how = self._aggregate(func, axis=axis, *args, **kwargs) + result, how = self._aggregate(func, axis, *args, **kwargs) except TypeError as err: exc = TypeError( "DataFrame constructor called with " diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 08c988fa05b6a..2e35bb94dfff6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1219,57 +1219,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) - - # reorder the values - values = [values[i] for i in indexer] - - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None + key_index = self.grouper.result_index if self.as_index else None - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + if isinstance(first_not_none, Series): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) values = [x if (x is not None) else backup for x in values] @@ -1278,7 +1246,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same((x.index for x in values)) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1310,7 
+1278,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1323,34 +1290,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): + # Combine values + # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T + result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=index, columns=columns ) + elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 11d0c8e42f745..9415ee1b7e969 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -994,9 +994,10 @@ def _agg_general( ): self._set_group_selection() + result = None # try a cython aggregation if we can try: - return self._cython_agg_general( + result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, ) except DataError: @@ -1012,8 +1013,9 @@ def _agg_general( raise # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + if result is None: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1125,12 +1127,12 @@ def reset_identity(values): # when the ax has duplicates # so we resort to this # GH 14776, 30667 - if ax.has_duplicates: + if ax.has_duplicates and not result.axes[self.axis].equals(ax): indexer, _ = result.index.get_indexer_non_unique(ax.values) indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis, copy=False) elif self.group_keys: @@ -2487,9 +2489,9 @@ def _get_cythonized_result( except TypeError as e: error_msg = str(e) continue + vals = vals.astype(cython_dtype, copy=False) if needs_2d: vals = vals.reshape((-1, 1)) - vals = vals.astype(cython_dtype, copy=False) func = partial(func, vals) func = partial(func, labels) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 
272afe7335c6a..c552b587e036e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -556,8 +556,13 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - values = Categorical(self.grouper) - return values._reverse_indexer() + # Return a dictionary of {group label: [indices belonging to the group label]} + # respecting whether sort was specified + codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + return { + category: np.flatnonzero(codes == i) + for i, category in enumerate(Index(uniques)) + } @property def codes(self) -> np.ndarray: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4c5a70f4088ee..678753f684141 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -298,15 +298,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a18f7bdccd0d0..e4dee2b0a08ce 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5094,7 +5094,9 @@ def get_slice_bound(self, label, side: str_t, kind) -> int: if is_bool_dtype(slc): slc = lib.maybe_booleans_to_slice(slc.view("u1")) else: - slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) + slc = lib.maybe_indices_to_slice( + slc.astype(np.intp, copy=False), len(self) + ) if isinstance(slc, np.ndarray): raise KeyError( f"Cannot get {side} slice bound for non-unique " diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9b57a25f1b0e9..b30ef37c14b4b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -15,7 +15,6 @@ from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( - ensure_int64, is_bool_dtype, is_dtype_equal, is_integer, @@ -181,7 +180,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = ensure_int64(indices) + indices = np.asarray(indices, dtype=np.intp) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) if isinstance(maybe_slice, slice): @@ -581,7 +580,9 @@ def delete(self, loc): freq = self.freq else: if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) + loc = lib.maybe_indices_to_slice( + np.asarray(loc, dtype=np.intp), len(self) + ) if isinstance(loc, slice) and loc.step in (1, None): if loc.start in (0, None) or loc.stop in (len(self), None): freq = self.freq diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 09504d50bbf40..b9ba823ca1b0b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1454,7 +1454,10 @@ def is_monotonic_increasing(self) -> bool: return if the index is monotonic increasing (only equal or increasing) values. 
""" - if all(x.is_monotonic for x in self.levels): + if any(-1 in code for code in self.codes): + return False + + if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. GH27495 return libalgos.is_lexsorted( [x.astype("int64", copy=False) for x in self.codes] @@ -2685,7 +2688,7 @@ def get_loc(self, key, method=None): def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" - if not isinstance(loc, np.ndarray) or loc.dtype != "int64": + if not isinstance(loc, np.ndarray) or loc.dtype != np.intp: return loc loc = lib.maybe_indices_to_slice(loc, len(self)) @@ -2732,7 +2735,7 @@ def _maybe_to_slice(loc): stacklevel=10, ) - loc = np.arange(start, stop, dtype="int64") + loc = np.arange(start, stop, dtype=np.intp) for i, k in enumerate(follow_key, len(lead_key)): mask = self.codes[i][loc] == self._get_loc_single_level_index( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5a24addf46d93..d21ff6ee17537 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1669,8 +1669,6 @@ def _setitem_with_indexer(self, indexer, value): "length than the value" ) - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - def isetter(loc, v): # positional setting on column loc ser = self.obj._ixs(loc, axis=1) @@ -1680,15 +1678,15 @@ def isetter(loc, v): # which means essentially reassign to the columns of a # multi-dim object # GH6149 (null slice), GH10408 (full bounds) - if isinstance(pi, tuple) and all( + if isinstance(plane_indexer, tuple) and all( com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) - for idx in pi + for idx in plane_indexer ): ser = v else: # set the item, possibly having a dtype change ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=pi, value=v) + ser._mgr = ser._mgr.setitem(indexer=plane_indexer, value=v) ser._maybe_update_cacher(clear=True) # reset the sliced object if unique diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3f3f0c68cb1ed..c9ac9cb0f140a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -239,8 +239,8 @@ def _rebuild_blknos_and_blklocs(self) -> None: """ Update mgr._blknos / mgr._blklocs. """ - new_blknos = np.empty(self.shape[0], dtype=np.int64) - new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos = np.empty(self.shape[0], dtype=np.intp) + new_blklocs = np.empty(self.shape[0], dtype=np.intp) new_blknos.fill(-1) new_blklocs.fill(-1) diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index c60b67fa2f4f6..6a44178e3c704 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -93,8 +93,19 @@ def _wrap_inplace_method(method): def f(self, other): result = method(self, other) + + if ( + self.ndim == 1 + and result._indexed_same(self) + and result.dtype == self.dtype + ): + # GH#36498 this inplace op can _actually_ be inplace. 
+ self._values[:] = result._values + return self + # Delete cacher self._reset_cacher() + # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy self._update_inplace( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..0dfbf96947c33 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -365,8 +365,9 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except DataError: + except (DataError, AttributeError, KeyError): # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): diff --git a/pandas/core/series.py b/pandas/core/series.py index b6ff7b33d27cb..18a201674db65 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1037,10 +1037,8 @@ def _set_with_engine(self, key, value): def _set_with(self, key, value): # other: fancy integer or otherwise if isinstance(key, slice): - # extract_array so that if we set e.g. ser[-5:] = ser[:5] - # we get the first five values, and not 5 NaNs indexer = self.index._convert_slice_indexer(key, kind="getitem") - self.iloc[indexer] = extract_array(value, extract_numpy=True) + return self._set_values(indexer, value) else: assert not isinstance(key, tuple) @@ -1058,12 +1056,26 @@ def _set_with(self, key, value): # should be caught by the is_bool_indexer check in __setitem__ if key_type == "integer": if not self.index._should_fallback_to_positional(): - self.loc[key] = value + self._set_labels(key, value) else: - self.iloc[key] = value + self._set_values(key, value) else: self.loc[key] = value + def _set_labels(self, key, value): + key = com.asarray_tuplesafe(key) + indexer: np.ndarray = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise KeyError(f"{key[mask]} not in index") + self._set_values(indexer, value) + + def _set_values(self, key, value): + if isinstance(key, Series): + key = key._values + self._mgr = self._mgr.setitem(indexer=key, value=value) + self._maybe_update_cacher() + def _set_value(self, label, value, takeable: bool = False): """ Quickly set single value at passed label. 
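[Reviewer note, not part of the patch: the `Series._set_with` change above routes slice and positional-integer keys through the new `_set_values` helper, which writes through the block manager (`self._mgr.setitem(...)`) without first aligning a Series value on its index. A minimal sketch of the behavior this restores for GH 37427; the expected values are taken from the regression test added to pandas/tests/series/indexing/test_getitem.py further down in this diff:

    import numpy as np
    import pandas as pd

    ser = pd.Series(range(10))
    idx = np.array([2, 4, 9])

    # With the fix, the right-hand Series is assigned positionally rather than
    # being aligned on its index (0, 1, 2), so no NaNs are introduced.
    ser[idx] = pd.Series([10, 11, 12])
    assert ser.tolist() == [0, 1, 10, 3, 11, 5, 6, 7, 8, 12]

The companion `_set_labels` path still resolves the key through `Index.get_indexer` first, so label-based keys keep raising `KeyError` for missing labels.]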
diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 7cbe34cdebf9f..7c76a8e2a0b22 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -7,6 +7,8 @@ from pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender +from pandas.core.dtypes.common import ensure_platform_int + from pandas.tseries.offsets import Nano get_window_bounds_doc = """ @@ -296,9 +298,9 @@ def get_window_bounds( start_arrays = [] end_arrays = [] window_indicies_start = 0 - for key, indicies in self.groupby_indicies.items(): + for key, indices in self.groupby_indicies.items(): if self.index_array is not None: - index_array = self.index_array.take(indicies) + index_array = self.index_array.take(ensure_platform_int(indices)) else: index_array = self.index_array indexer = self.rolling_indexer( @@ -307,22 +309,22 @@ def get_window_bounds( **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( - len(indicies), min_periods, center, closed + len(indices), min_periods, center, closed ) start = start.astype(np.int64) end = end.astype(np.int64) # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indicies), + window_indicies_start, window_indicies_start + len(indices), ) - window_indicies_start += len(indicies) + window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) window_indicies = np.append( window_indicies, [window_indicies[-1] + 1] ).astype(np.int64) - start_arrays.append(window_indicies.take(start)) - end_arrays.append(window_indicies.take(end)) + start_arrays.append(window_indicies.take(ensure_platform_int(start))) + end_arrays.append(window_indicies.take(ensure_platform_int(end))) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) # GH 35552: Need to adjust start and end based on the nans appended to values diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f7bcd1e795fd3..617c43e0a59ed 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -409,7 +409,7 @@ def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: if self.on is not None and not self._on.equals(obj.index): name = self._on.name - final.append(Series(self._on, index=obj.index, name=name)) + final.append(Series(self._on, index=self.obj.index, name=name)) if self._selection is not None: @@ -2259,7 +2259,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: """ rolling_indexer: Type[BaseIndexer] indexer_kwargs: Optional[Dict] = None - index_array = self.obj.index.asi8 + index_array = self._on.asi8 if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7a53b46a4ac0f..db6704f7a96a4 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -332,13 +332,13 @@ def _verbose_repr( ) for i, col in enumerate(ids): - dtype = dtypes[i] + dtype = dtypes.iloc[i] col = pprint_thing(col) line_no = _put_str(f" {i}", space_num) count = "" if show_counts: - count = counts[i] + count = counts.iloc[i] lines.append( line_no diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a02b059967e88..6ce887710ad8f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -413,7 +413,9 @@ def _validate_names(names): if names is not 
None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") - if not is_list_like(names, allow_sets=False): + if not ( + is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) + ): raise ValueError("Names should be an ordered collection.") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..8e1c72d4aaa7e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning): precision_loss_doc = """ -Column converted from %s to %s, and some data are outside of the lossless +Column converted from {0} to {1}, and some data are outside of the lossless conversion range. This may result in a loss of precision in the saved data. """ @@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: object in a DataFrame. """ ws = "" - # original, if small, if large + # original, if small, if large conversion_data = ( (np.bool_, np.int8, np.int8), (np.uint8, np.int8, np.int16), @@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: dtype = c_data[1] else: dtype = c_data[2] - if c_data[2] == np.float64: # Warn if necessary + if c_data[2] == np.int64: # Warn if necessary if data[col].max() >= 2 ** 53: ws = precision_loss_doc.format("uint64", "float64") @@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) self.text_len = 0 - self.off: List[int] = [] - self.val: List[int] = [] self.txt: List[bytes] = [] self.n = 0 # Compute lengths and setup lists of offsets and labels + offsets: List[int] = [] + values: List[int] = [] for vl in self.value_labels: category = vl[1] if not isinstance(category, str): @@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ValueLabelTypeMismatch, ) category = category.encode(encoding) - self.off.append(self.text_len) + offsets.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding - self.val.append(vl[0]) + values.append(vl[0]) self.txt.append(category) self.n += 1 @@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ) # Ensure int32 - self.off = np.array(self.off, dtype=np.int32) - self.val = np.array(self.val, dtype=np.int32) + self.off = np.array(offsets, dtype=np.int32) + self.val = np.array(values, dtype=np.int32) # Total length self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len @@ -876,23 +876,23 @@ def __init__(self): # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)])) + list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [ - (251, np.int8), - (252, np.int16), - (253, np.int32), - (254, np.float32), - (255, np.float64), + (251, np.dtype(np.int8)), + (252, np.dtype(np.int16)), + (253, np.dtype(np.int32)), + (254, np.dtype(np.float32)), + (255, np.dtype(np.float64)), ] ) self.DTYPE_MAP_XML = dict( [ - (32768, np.uint8), # Keys to GSO - (65526, np.float64), - (65527, np.float32), - (65528, np.int32), - (65529, np.int16), - (65530, np.int8), + (32768, np.dtype(np.uint8)), # Keys to GSO + (65526, np.dtype(np.float64)), + (65527, np.dtype(np.float32)), + (65528, np.dtype(np.int32)), + (65529, np.dtype(np.int16)), + (65530, np.dtype(np.int8)), ] ) self.TYPE_MAP = list(range(251)) + list("bhlfd") @@ -1050,9 +1050,10 @@ def 
__init__( self._order_categoricals = order_categoricals self._encoding = "" self._chunksize = chunksize - if self._chunksize is not None and ( - not isinstance(chunksize, int) or chunksize <= 0 - ): + self._using_iterator = False + if self._chunksize is None: + self._chunksize = 1 + elif not isinstance(chunksize, int) or chunksize <= 0: raise ValueError("chunksize must be a positive integer when set.") # State variables for the file @@ -1062,7 +1063,7 @@ def __init__( self._column_selector_set = False self._value_labels_read = False self._data_read = False - self._dtype = None + self._dtype: Optional[np.dtype] = None self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) @@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None: # Get data type information, works for versions 117-119. def _get_dtypes( self, seek_vartypes: int - ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]: + ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]: self.path_or_buf.seek(seek_vartypes) raw_typlist = [ @@ -1519,11 +1520,8 @@ def _read_strls(self) -> None: self.GSO[str(v_o)] = decoded_va def __next__(self) -> DataFrame: - if self._chunksize is None: - raise ValueError( - "chunksize must be set to a positive integer to use as an iterator." - ) - return self.read(nrows=self._chunksize or 1) + self._using_iterator = True + return self.read(nrows=self._chunksize) def get_chunk(self, size: Optional[int] = None) -> DataFrame: """ @@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool: convert = False for col in data: dtype = data[col].dtype - if dtype in (np.float16, np.float32): - dtype = np.float64 + if dtype in (np.dtype(np.float16), np.dtype(np.float32)): + dtype = np.dtype(np.float64) convert = True - elif dtype in (np.int8, np.int16, np.int32): - dtype = np.int64 + elif dtype in ( + np.dtype(np.int8), + np.dtype(np.int16), + np.dtype(np.int32), + ): + dtype = np.dtype(np.int64) convert = True retyped_data.append((col, data[col].astype(dtype))) if convert: @@ -1807,14 +1809,14 @@ def _do_convert_categoricals( keys = np.array(list(vl.keys())) column = data[col] key_matches = column.isin(keys) - if self._chunksize is not None and key_matches.all(): - initial_categories = keys + if self._using_iterator and key_matches.all(): + initial_categories: Optional[np.ndarray] = keys # If all categories are in the keys and we are iterating, # use the same keys for all chunks. If some are missing # value labels, then we will fall back to the categories # varying across chunks. 
             else:
-                if self._chunksize is not None:
+                if self._using_iterator:
                     # warn if using an iterator
                     warnings.warn(
                         categorical_conversion_warning, CategoricalConversionWarning
                     )
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
     else:
         raise NotImplementedError(f"Format {fmt} not implemented")
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index a58dc5e5ec74a..f6cd500f911b2 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -991,3 +991,10 @@ def test_is_dtype_no_warning(check):
 
     with tm.assert_produces_warning(None):
         check(data["A"])
+
+
+def test_period_dtype_compare_to_string():
+    # https://github.com/pandas-dev/pandas/issues/37265
+    dtype = PeriodDtype(freq="M")
+    assert (dtype == "period[M]") is True
+    assert (dtype != "period[M]") is False
diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py
index 1657abcc96d76..a89a20fc69ef8 100644
--- a/pandas/tests/frame/apply/test_frame_apply.py
+++ b/pandas/tests/frame/apply/test_frame_apply.py
@@ -1480,6 +1480,34 @@ def test_agg_cython_table_raises(self, df, func, expected, axis):
         with pytest.raises(expected, match=msg):
             df.agg(func, axis=axis)
 
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.parametrize(
+        "args, kwargs",
+        [
+            ((1, 2, 3), {}),
+            ((8, 7, 15), {}),
+            ((1, 2), {}),
+            ((1,), {"b": 2}),
+            ((), {"a": 1, "b": 2}),
+            ((), {"a": 2, "b": 1}),
+            ((), {"a": 1, "b": 2, "c": 3}),
+        ],
+    )
+    def test_agg_args_kwargs(self, axis, args, kwargs):
+        def f(x, a, b, c=3):
+            return x.sum() + (a + b) / c
+
+        df = pd.DataFrame([[1, 2], [3, 4]])
+
+        if axis == 0:
+            expected = pd.Series([5.0, 7.0])
+        else:
+            expected = pd.Series([4.0, 8.0])
+
+        result = df.agg(f, axis, *args, **kwargs)
+
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("num_cols", [2, 3, 5])
     def test_frequency_is_original(self, num_cols):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index c5945edfd3127..3d19a2567f3a5 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -184,3 +184,12 @@ def test_setitem_extension_types(self, obj, dtype):
 
         df["obj"] = obj
         tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize("klass", [list, np.array])
+    def test_iloc_setitem_bool_indexer(self, klass):
+        # GH: 36741
+        df = DataFrame({"flag": ["x", "y", "z"], "value": [1, 3, 4]})
+        indexer = klass([True, False, False])
+        df.iloc[indexer, 1] = df.iloc[indexer, 1] * 2
+        expected = DataFrame({"flag": ["x", "y", "z"], "value": [2, 3, 4]})
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py
index 35d45bd00131b..29a3a0106c56c 100644
--- a/pandas/tests/frame/methods/test_isin.py
+++ b/pandas/tests/frame/methods/test_isin.py
@@ -204,3 +204,12 @@ def test_isin_category_frame(self, values):
 
         result = df.isin(values)
         tm.assert_frame_equal(result, expected)
+
+    def test_isin_read_only(self):
+        # https://github.com/pandas-dev/pandas/issues/37174
+        arr = np.array([1, 2, 3])
+        arr.setflags(write=False)
+        df = DataFrame([1, 2, 3])
+        result = df.isin(arr)
+        expected = DataFrame([True, True, True])
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_arithmetic.py 
b/pandas/tests/frame/test_arithmetic.py index 166f26f668502..d9c7585d55a1b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1566,3 +1566,16 @@ def test_arith_reindex_with_duplicates(): result = df1 + df2 expected = pd.DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"]) tm.assert_frame_equal(result, expected) + + +def test_inplace_arithmetic_series_update(): + # https://github.com/pandas-dev/pandas/issues/36373 + df = DataFrame({"A": [1, 2, 3]}) + series = df["A"] + vals = series._values + + series += 1 + assert series._values is vals + + expected = DataFrame({"A": [2, 3, 4]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4d0f1a326225d..ad991f92ee99f 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -770,21 +770,31 @@ def test_categorical_accessor(method): # Groupby +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", [operator.methodcaller("sum"), lambda x: x.agg("sum")], +) +def test_groupby_finalize(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} + + @pytest.mark.parametrize( "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] ) @pytest.mark.parametrize( "method", [ - operator.methodcaller("sum"), - lambda x: x.agg("sum"), lambda x: x.agg(["sum", "count"]), lambda x: x.transform(lambda y: y), lambda x: x.apply(lambda y: y), ], ) @not_implemented_mark -def test_groupby(obj, method): +def test_groupby_finalize_not_implemented(obj, method): obj.attrs = {"a": 1} result = method(obj.groupby([0, 0])) assert result.attrs == {"a": 1} diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 87ebd8b5a27fb..7bacb62ce62f4 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -277,3 +277,27 @@ def test_read_only_buffer_source_agg(agg): expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op_name", + ["count", "sum", "std", "var", "sem", "mean", "median", "prod", "min", "max"], +) +def test_cython_agg_nullable_int(op_name): + # ensure that the cython-based aggregations don't fail for nullable dtype + # (eg https://github.com/pandas-dev/pandas/issues/37415) + df = DataFrame( + { + "A": ["A", "B"] * 5, + "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), + } + ) + result = getattr(df.groupby("A")["B"], op_name)() + df2 = df.assign(B=df["B"].astype("float64")) + expected = getattr(df2.groupby("A")["B"], op_name)() + + if op_name != "count": + # the result is not yet consistently using Int64/Float64 dtype, + # so for now just checking the values by casting to float + result = result.astype("float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..2af495a170bee 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -868,13 +868,14 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( 
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ebce5b0ef0a66..bdb283ae445b1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1255,8 +1255,8 @@ def test_groupby_nat_exclude(): assert grouped.ngroups == 2 expected = { - Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.int64), - Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.int64), + Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp), + Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp), } for k in grouped.indices: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 5f82203d92dc3..1cb539f6010fd 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,7 +5,6 @@ import pytest from pandas._libs import iNaT -from pandas.compat.numpy import _is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -418,7 +417,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index, request): + def test_intersection_base(self, index): if isinstance(index, CategoricalIndex): return @@ -435,15 +434,6 @@ def test_intersection_base(self, index, request): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - # https://github.com/pandas-dev/pandas/issues/35481 - if ( - _is_numpy_dev - and isinstance(case, Series) - and isinstance(index, UInt64Index) - ): - mark = pytest.mark.xfail(reason="gh-35481") - request.node.add_marker(mark) - result = first.intersection(case) assert tm.equalContents(result, second) diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index ca1cb0932f63d..8659573d8123a 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas import Index, MultiIndex @@ -174,3 +175,14 @@ def test_is_strictly_monotonic_decreasing(): ) assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is False + + +@pytest.mark.parametrize("attr", ["is_monotonic_increasing", "is_monotonic_decreasing"]) +@pytest.mark.parametrize( + "values", + [[(np.nan,), (1,), (2,)], [(1,), (np.nan,), (2,)], [(1,), (2,), (np.nan,)]], +) +def test_is_monotonic_with_nans(values, attr): + # GH: 37220 + idx = pd.MultiIndex.from_tuples(values, names=["test"]) + assert getattr(idx, attr) is False diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 193800fae751f..b56a92ce71605 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -896,6 +896,7 @@ def test_identity_slice_returns_new_object(self): original_series[:3] = [7, 8, 9] assert all(sliced_series[:3] == [7, 8, 9]) + @pytest.mark.xfail(reason="accidental fix reverted - GH37497") def test_loc_copy_vs_view(self): # GH 15631 x = 
DataFrame(zip(range(3), range(3)), columns=["a", "b"])
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
index 337ec683ee745..15fb616528a21 100644
--- a/pandas/tests/indexing/test_partial.py
+++ b/pandas/tests/indexing/test_partial.py
@@ -681,3 +681,24 @@ def test_index_name_empty(self):
             {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index")
         )
         tm.assert_frame_equal(df, expected)
+
+    def test_slice_irregular_datetime_index_with_nan(self):
+        # GH36953
+        index = pd.to_datetime(["2012-01-01", "2012-01-02", "2012-01-03", None])
+        df = DataFrame(range(len(index)), index=index)
+        expected = DataFrame(range(len(index[:3])), index=index[:3])
+        result = df["2012-01-01":"2012-01-04"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_slice_datetime_index(self):
+        # GH35509
+        df = DataFrame(
+            {"col1": ["a", "b", "c"], "col2": [1, 2, 3]},
+            index=pd.to_datetime(["2020-08-01", "2020-07-02", "2020-08-05"]),
+        )
+        expected = DataFrame(
+            {"col1": ["a", "c"], "col2": [1, 3]},
+            index=pd.to_datetime(["2020-08-01", "2020-08-05"]),
+        )
+        result = df.loc["2020-08"]
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
index 877bd1650ae60..6d2d9be97d7fa 100644
--- a/pandas/tests/io/formats/test_info.py
+++ b/pandas/tests/io/formats/test_info.py
@@ -7,7 +7,7 @@
 import numpy as np
 import pytest
 
-from pandas.compat import PYPY
+from pandas.compat import IS64, PYPY
 
 from pandas import (
     CategoricalIndex,
@@ -403,3 +403,26 @@ def test_info_categorical():
 
     buf = StringIO()
     df.info(buf=buf)
+
+
+@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
+def test_info_int_columns():
+    # GH#37245
+    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
+    buf = StringIO()
+    df.info(null_counts=True, buf=buf)
+    result = buf.getvalue()
+    expected = textwrap.dedent(
+        """\
+        <class 'pandas.core.frame.DataFrame'>
+        Index: 2 entries, A to B
+        Data columns (total 2 columns):
+         #   Column  Non-Null Count  Dtype
+        ---  ------  --------------  -----
+         0   1       2 non-null      int64
+         1   2       2 non-null      int64
+        dtypes: int64(2)
+        memory usage: 48.0+ bytes
+        """
+    )
+    assert result == expected
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index c6a43d22ca155..9434ad5fe8761 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -2212,3 +2212,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers):
     )
     with pytest.raises(ValueError, match=msg):
         parser.read_table(f, delim_whitespace=True, sep=",")
+
+
+def test_dict_keys_as_names(all_parsers):
+    # GH: 36928
+    data = "1,2"
+
+    keys = {"a": int, "b": int}.keys()
+    parser = all_parsers
+
+    result = parser.read_csv(StringIO(data), names=keys)
+    expected = DataFrame({"a": [1], "b": [2]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index 9f425168540ba..33e24c55d44d9 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -207,3 +207,18 @@ def test_header_with_index_col(all_parsers):
 
     result = parser.read_csv(StringIO(data), index_col="I11", header=0)
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.slow
+def test_index_col_large_csv(all_parsers):
+    # https://github.com/pandas-dev/pandas/issues/37094
+    parser = all_parsers
+
+    N = 1_000_001
+    df = DataFrame({"a": range(N), "b": np.random.randn(N)})
+
+    with tm.ensure_clean() as path:
+        df.to_csv(path, 
index=False) + result = parser.read_csv(path, index_col=[0]) + + tm.assert_frame_equal(result, df.set_index("a")) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 306b2a7849586..6df13278fcb75 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -563,16 +563,20 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 # Previous behaviour was pyarrow partitioned columns become 'category' dtypes # These are added to back of dataframe on read. In new API category dtype is - # only used if partition field is string. - legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") - if partition_col and legacy_read_table: - partition_col_type = "category" - else: - partition_col_type = "int32" - - expected_df[partition_col] = expected_df[partition_col].astype( - partition_col_type + # only used if partition field is string, but this changed again to use + # category dtype for all types (not only strings) in pyarrow 2.0.0 + pa10 = (LooseVersion(pyarrow.__version__) >= LooseVersion("1.0.0")) and ( + LooseVersion(pyarrow.__version__) < LooseVersion("2.0.0") ) + if partition_col: + if pa10: + partition_col_type = "int32" + else: + partition_col_type = "category" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) check_round_trip( df_compat, diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d7fec803a8e0..9788602242128 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath): StataReader(dta_file, chunksize=0) with pytest.raises(ValueError, match="chunksize must be a positive"): StataReader(dta_file, chunksize="apple") - with pytest.raises(ValueError, match="chunksize must be set to a positive"): - with StataReader(dta_file) as reader: - reader.__next__() def test_iterator_value_labels(): @@ -1983,3 +1980,18 @@ def test_iterator_value_labels(): for i in range(2): tm.assert_index_equal(chunk.dtypes[i].categories, expected) tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + + +def test_precision_loss(): + df = DataFrame( + [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]], + columns=["big", "little"], + ) + with tm.ensure_clean() as path: + with tm.assert_produces_warning(PossiblePrecisionLoss): + df.to_stata(path, write_index=False) + reread = read_stata(path) + expected_dt = Series([np.float64, np.float64], index=["big", "little"]) + tm.assert_series_equal(reread.dtypes, expected_dt) + assert reread.loc[0, "little"] == df.loc[0, "little"] + assert reread.loc[0, "big"] == float(df.loc[0, "big"]) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index b36b11582c1ec..6ca965630248f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -6,7 +6,7 @@ from pandas.util._test_decorators import async_mark import pandas as pd -from pandas import DataFrame, Series, Timestamp, compat +from pandas import DataFrame, Series, Timestamp import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -317,7 +317,6 @@ def test_resample_groupby_with_label(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(not compat.IS64, reason="GH-35148") def test_consistency_with_window(): # consistent return values with window @@ 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 6d7fec803a8e0..9788602242128 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
         StataReader(dta_file, chunksize=0)
     with pytest.raises(ValueError, match="chunksize must be a positive"):
         StataReader(dta_file, chunksize="apple")
-    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
-        with StataReader(dta_file) as reader:
-            reader.__next__()
 
 
 def test_iterator_value_labels():
@@ -1983,3 +1980,18 @@ def test_iterator_value_labels():
             for i in range(2):
                 tm.assert_index_equal(chunk.dtypes[i].categories, expected)
             tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+
+
+def test_precision_loss():
+    df = DataFrame(
+        [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
+        columns=["big", "little"],
+    )
+    with tm.ensure_clean() as path:
+        with tm.assert_produces_warning(PossiblePrecisionLoss):
+            df.to_stata(path, write_index=False)
+        reread = read_stata(path)
+        expected_dt = Series([np.float64, np.float64], index=["big", "little"])
+        tm.assert_series_equal(reread.dtypes, expected_dt)
+        assert reread.loc[0, "little"] == df.loc[0, "little"]
+        assert reread.loc[0, "big"] == float(df.loc[0, "big"])
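
The two columns in `test_precision_loss` sit on either side of the float64 mantissa boundary: Stata has no 64-bit integer storage type, so `to_stata` downcasts int64 to float64 and emits `PossiblePrecisionLoss`. The arithmetic behind the expected values, as a plain-Python sketch:

    big = sum(2 ** i for i in range(60))  # 2**60 - 1: needs 60 significant bits
    little = sum(2 ** i for i in range(52))  # 2**52 - 1: fits the 53-bit mantissa

    assert float(little) == little  # survives the float64 round trip exactly
    assert float(big) != big  # rounded, hence only float(big) compares equal
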
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index b36b11582c1ec..6ca965630248f 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -6,7 +6,7 @@
 from pandas.util._test_decorators import async_mark
 
 import pandas as pd
-from pandas import DataFrame, Series, Timestamp, compat
+from pandas import DataFrame, Series, Timestamp
 import pandas._testing as tm
 from pandas.core.indexes.datetimes import date_range
 
@@ -317,7 +317,6 @@ def test_resample_groupby_with_label():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(not compat.IS64, reason="GH-35148")
 def test_consistency_with_window():
 
     # consistent return values with window
@@ -346,3 +345,18 @@ def test_median_duplicate_columns():
     result = df.resample("5s").median()
     expected.columns = result.columns
     tm.assert_frame_equal(result, expected)
+
+
+def test_apply_to_one_column_of_df():
+    # GH: 36951
+    df = pd.DataFrame(
+        {"col": range(10), "col1": range(10, 20)},
+        index=pd.date_range("2012-01-01", periods=10, freq="20min"),
+    )
+    result = df.resample("H").apply(lambda group: group.col.sum())
+    expected = pd.Series(
+        [3, 12, 21, 9], index=pd.date_range("2012-01-01", periods=4, freq="H")
+    )
+    tm.assert_series_equal(result, expected)
+    result = df.resample("H").apply(lambda group: group["col"].sum())
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
index 6b7cda89a4714..cf03dfb8ca9b7 100644
--- a/pandas/tests/series/indexing/test_getitem.py
+++ b/pandas/tests/series/indexing/test_getitem.py
@@ -137,3 +137,13 @@ def test_getitem_ndim_deprecated():
     s = pd.Series([0, 1])
     with tm.assert_produces_warning(FutureWarning):
         s[:, None]
+
+
+def test_getitem_assignment_series_alignment():
+    # https://github.com/pandas-dev/pandas/issues/37427
+    # when assigning with a Series via getitem, the value is not aligned first
+    s = Series(range(10))
+    idx = np.array([2, 4, 9])
+    s[idx] = Series([10, 11, 12])
+    expected = Series([0, 1, 10, 3, 11, 5, 6, 7, 8, 12])
+    tm.assert_series_equal(s, expected)
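
The getitem test pins down the assignment semantics restored here: with `s[indexer] = rhs`, a right-hand Series is written positionally rather than aligned by label first. A condensed sketch; the values in the comment are what label alignment would presumably have produced:

    import numpy as np
    import pandas as pd

    s = pd.Series(range(10))
    idx = np.array([2, 4, 9])

    # The RHS Series carries index 0, 1, 2. Label alignment would look up
    # labels 2, 4 and 9 on the RHS (12.0, NaN, NaN); instead the three
    # values land positionally in slots 2, 4 and 9.
    s[idx] = pd.Series([10, 11, 12])
    print(s.tolist())  # [0, 1, 10, 3, 11, 5, 6, 7, 8, 12]
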
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 7449d8d65ef96..ad960d4c65268 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import Interval, Series, Timestamp, date_range
+from pandas import NA, Interval, Series, Timestamp, date_range
 import pandas._testing as tm
 
 
@@ -55,3 +55,13 @@ def test_astype_from_float_to_str(self, dtype):
         result = s.astype(str)
         expected = Series(["0.1"])
         tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "value, string_value", [(None, "None"), (np.nan, "nan"), (NA, "<NA>")],
+    )
+    def test_astype_to_str_preserves_na(self, value, string_value):
+        # https://github.com/pandas-dev/pandas/issues/36904
+        s = Series(["a", "b", value], dtype=object)
+        result = s.astype(str)
+        expected = Series(["a", "b", string_value], dtype=object)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 3836c1d56bf87..86ea2b2f02a4d 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -80,3 +80,22 @@ def test_isin_empty(self, empty):
 
         result = s.isin(empty)
         tm.assert_series_equal(expected, result)
+
+    def test_isin_read_only(self):
+        # https://github.com/pandas-dev/pandas/issues/37174
+        arr = np.array([1, 2, 3])
+        arr.setflags(write=False)
+        s = Series([1, 2, 3])
+        result = s.isin(arr)
+        expected = Series([True, True, True])
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.slow
+def test_isin_large_series_mixed_dtypes_and_nan():
+    # https://github.com/pandas-dev/pandas/issues/37094
+    # combination of object dtype for the values and > 1_000_000 elements
+    ser = Series([1, 2, np.nan] * 1_000_000)
+    result = ser.isin({"foo", "bar"})
+    expected = Series([False] * 3 * 1_000_000)
+    tm.assert_series_equal(result, expected)
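
Both isin additions exercise the hashtable layer. The read-only case is easy to reproduce outside the test suite; the `write=False` flag stands in for memory-mapped or otherwise shared buffers, and before the fix this raised a `ValueError` complaining that the buffer source array is read-only:

    import numpy as np
    import pandas as pd

    values = np.array([1, 2, 3])
    values.setflags(write=False)  # simulate a memory-mapped / shared buffer

    print(pd.Series([1, 2, 3]).isin(values).tolist())  # [True, True, True]
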
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index a8a55418a619a..a78f8ad3cd4dd 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1751,11 +1751,13 @@ def _check(arr):
         _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan]))
         _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan]))
 
-    def test_basic(self):
+    def test_basic(self, writable):
         exp = np.array([1, 2], dtype=np.float64)
 
         for dtype in np.typecodes["AllInteger"]:
-            s = Series([1, 100], dtype=dtype)
+            data = np.array([1, 100], dtype=dtype)
+            data.setflags(write=writable)
+            s = Series(data)
             tm.assert_numpy_array_equal(algos.rank(s), exp)
 
     def test_uint64_overflow(self):
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index b32c5e91af295..4113ff6bb27d3 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -150,6 +150,18 @@ def test_missing_required_dependency():
     # https://github.com/MacPython/pandas-wheels/pull/50
 
     pyexe = sys.executable.replace("\\", "/")
+
+    # We skip this test if pandas is installed as a site package. We first
+    # import the package normally and check the path to the module before
+    # executing the test, which imports pandas with site packages disabled.
+    call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
+    output = subprocess.check_output(call).decode()
+    if "site-packages" in output:
+        pytest.skip("pandas installed as site package")
+
+    # This test will fail if pandas is installed as a site package. The flags
+    # prevent pandas from being imported, and the test would report
+    # Failed: DID NOT RAISE
    call = [pyexe, "-sSE", "-c", "import pandas"]
 
     msg = (
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index b6f59807eaa15..60cbe0d94e734 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -51,7 +51,7 @@ def test_maybe_indices_to_slice_left_edge(self):
         target = np.arange(100)
 
         # slice
-        indices = np.array([], dtype=np.int64)
+        indices = np.array([], dtype=np.intp)
         maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
         assert isinstance(maybe_slice, slice)
@@ -59,7 +59,7 @@ def test_maybe_indices_to_slice_left_edge(self):
 
         for end in [1, 2, 5, 20, 99]:
             for step in [1, 2, 4]:
-                indices = np.arange(0, end, step, dtype=np.int64)
+                indices = np.arange(0, end, step, dtype=np.intp)
                 maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
                 assert isinstance(maybe_slice, slice)
@@ -74,7 +74,7 @@ def test_maybe_indices_to_slice_left_edge(self):
 
         # not slice
         for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]:
-            indices = np.array(case, dtype=np.int64)
+            indices = np.array(case, dtype=np.intp)
             maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
             assert not isinstance(maybe_slice, slice)
@@ -87,7 +87,7 @@ def test_maybe_indices_to_slice_right_edge(self):
         # slice
         for start in [0, 2, 5, 20, 97, 98]:
             for step in [1, 2, 4]:
-                indices = np.arange(start, 99, step, dtype=np.int64)
+                indices = np.arange(start, 99, step, dtype=np.intp)
                 maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
                 assert isinstance(maybe_slice, slice)
@@ -101,7 +101,7 @@ def test_maybe_indices_to_slice_right_edge(self):
                 tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
 
         # not slice
-        indices = np.array([97, 98, 99, 100], dtype=np.int64)
+        indices = np.array([97, 98, 99, 100], dtype=np.intp)
         maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
         assert not isinstance(maybe_slice, slice)
@@ -114,7 +114,7 @@ def test_maybe_indices_to_slice_right_edge(self):
         with pytest.raises(IndexError, match=msg):
             target[maybe_slice]
 
-        indices = np.array([100, 99, 98, 97], dtype=np.int64)
+        indices = np.array([100, 99, 98, 97], dtype=np.intp)
         maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
         assert not isinstance(maybe_slice, slice)
@@ -126,7 +126,7 @@ def test_maybe_indices_to_slice_right_edge(self):
             target[maybe_slice]
 
         for case in [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]:
-            indices = np.array(case, dtype=np.int64)
+            indices = np.array(case, dtype=np.intp)
             maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
             assert not isinstance(maybe_slice, slice)
@@ -138,7 +138,7 @@ def test_maybe_indices_to_slice_both_edges(self):
 
         # slice
        for step in [1, 2, 4, 5, 8, 9]:
-            indices = np.arange(0, 9, step, dtype=np.int64)
+            indices = np.arange(0, 9, step, dtype=np.intp)
             maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
             assert isinstance(maybe_slice, slice)
             tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
@@ -151,7 +151,7 @@ def test_maybe_indices_to_slice_both_edges(self):
 
         # not slice
         for case in [[4, 2, 0, -2], [2, 2, 1, 0], [0, 1, 2, 1]]:
-            indices = np.array(case, dtype=np.int64)
+            indices = np.array(case, dtype=np.intp)
             maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
             assert not isinstance(maybe_slice, slice)
             tm.assert_numpy_array_equal(maybe_slice, indices)
@@ -163,7 +163,7 @@ def test_maybe_indices_to_slice_middle(self):
         # slice
         for start, end in [(2, 10), (5, 25), (65, 97)]:
             for step in [1, 2, 4, 20]:
-                indices = np.arange(start, end, step, dtype=np.int64)
+                indices = np.arange(start, end, step, dtype=np.intp)
                 maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
                 assert isinstance(maybe_slice, slice)
@@ -178,7 +178,7 @@ def test_maybe_indices_to_slice_middle(self):
 
         # not slice
         for case in [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]:
-            indices = np.array(case, dtype=np.int64)
+            indices = np.array(case, dtype=np.intp)
             maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
 
             assert not isinstance(maybe_slice, slice)
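
The `int64` to `intp` swap in these indexer tests only matters off 64-bit platforms: `np.intp` tracks the platform pointer width, which is the dtype the `maybe_indices_to_slice` signature expects. A one-liner makes the distinction visible:

    import numpy as np

    # int64 on 64-bit builds, int32 on 32-bit builds such as the manylinux
    # i686 CI job added in this release.
    print(np.dtype(np.intp).itemsize * 8)  # 64 or 32, depending on platform
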
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 8c51908c547f4..a720d813dbd8f 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -685,6 +685,11 @@ def test_isAnchored_deprecated(self, offset_types):
         expected = off.is_anchored()
         assert result == expected
 
+    def test_offsets_hashable(self, offset_types):
+        # GH: 37267
+        off = self._get_offset(offset_types)
+        assert hash(off) is not None
+
 
 class TestDateOffset(Base):
     def setup_method(self, method):
@@ -4438,3 +4443,13 @@ def test_week_add_invalid():
     other = Day()
     with pytest.raises(TypeError, match="Cannot add"):
         offset + other
+
+
+@pytest.mark.parametrize(
+    "attribute", ["hours", "days", "weeks", "months", "years"],
+)
+def test_dateoffset_immutable(attribute):
+    offset = DateOffset(**{attribute: 0})
+    msg = "DateOffset objects are immutable"
+    with pytest.raises(AttributeError, match=msg):
+        setattr(offset, attribute, 5)
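
The two offset tests are two sides of the same invariant: offsets hash because they are immutable, and mutation attempts raise. A quick interactive sketch of both:

    import pandas as pd

    off = pd.offsets.Day()
    assert hash(off) is not None  # hashable again (GH 37267)

    offset = pd.DateOffset(days=1)
    try:
        offset.days = 5  # mutation must fail (GH 36940)
    except AttributeError as err:
        print(err)  # DateOffset objects are immutable
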
diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py
index 28e27791cad35..2c3d8b4608806 100644
--- a/pandas/tests/window/test_api.py
+++ b/pandas/tests/window/test_api.py
@@ -6,7 +6,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
-from pandas import DataFrame, Index, Series, Timestamp, compat, concat
+from pandas import DataFrame, Index, Series, Timestamp, concat
 import pandas._testing as tm
 from pandas.core.base import SpecificationError
 
@@ -277,7 +277,7 @@ def test_preserve_metadata():
 @pytest.mark.parametrize(
     "func,window_size,expected_vals",
     [
-        pytest.param(
+        (
             "rolling",
             2,
             [
@@ -289,7 +289,6 @@ def test_preserve_metadata():
                 [35.0, 40.0, 60.0, 40.0],
                 [60.0, 80.0, 85.0, 80],
             ],
-            marks=pytest.mark.xfail(not compat.IS64, reason="GH-35294"),
         ),
         (
             "expanding",
diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py
index 2aaf6af103e98..bc38634da8941 100644
--- a/pandas/tests/window/test_apply.py
+++ b/pandas/tests/window/test_apply.py
@@ -4,7 +4,7 @@
 from pandas.errors import NumbaUtilError
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range
+from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
 import pandas._testing as tm
 
 
@@ -142,7 +142,6 @@ def test_invalid_kwargs_nopython():
 
 
 @pytest.mark.parametrize("args_kwargs", [[None, {"par": 10}], [(10,), None]])
-@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
 def test_rolling_apply_args_kwargs(args_kwargs):
     # GH 33433
     def foo(x, par):
diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py
index 806c22c60b48f..493a844ca7a44 100644
--- a/pandas/tests/window/test_grouper.py
+++ b/pandas/tests/window/test_grouper.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, Series, compat
+from pandas import DataFrame, Series
 import pandas._testing as tm
 from pandas.core.groupby.groupby import get_groupby
 
@@ -23,7 +23,6 @@ def test_mutated(self):
         g = get_groupby(self.frame, by="A", mutated=True)
         assert g.mutated
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_getitem(self):
         g = self.frame.groupby("A")
         g_mutated = get_groupby(self.frame, by="A", mutated=True)
@@ -56,7 +55,6 @@ def test_getitem_multiple(self):
         result = r.B.count()
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_rolling(self):
         g = self.frame.groupby("A")
         r = g.rolling(window=4)
@@ -74,7 +72,6 @@ def test_rolling(self):
     @pytest.mark.parametrize(
         "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
     )
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_rolling_quantile(self, interpolation):
         g = self.frame.groupby("A")
         r = g.rolling(window=4)
@@ -105,7 +102,6 @@ def func(x):
         expected = g.apply(func)
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_rolling_apply(self, raw):
         g = self.frame.groupby("A")
         r = g.rolling(window=4)
@@ -115,7 +111,6 @@ def test_rolling_apply(self, raw):
         expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_rolling_apply_mutability(self):
         # GH 14013
         df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6})
@@ -197,7 +192,6 @@ def test_expanding_apply(self, raw):
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("expected_value,raw_value", [[1.0, True], [0.0, False]])
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_rolling(self, expected_value, raw_value):
         # GH 31754
 
@@ -215,7 +209,6 @@ def foo(x):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_rolling_center_center(self):
         # GH 35552
         series = Series(range(1, 6))
@@ -281,7 +274,6 @@ def test_groupby_rolling_center_center(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_subselect_rolling(self):
         # GH 35486
         df = DataFrame(
@@ -307,7 +299,6 @@ def test_groupby_subselect_rolling(self):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_rolling_custom_indexer(self):
         # GH 35557
         class SimpleIndexer(pd.api.indexers.BaseIndexer):
@@ -331,7 +322,6 @@ def get_window_bounds(
         expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum()
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_rolling_subset_with_closed(self):
         # GH 35549
         df = pd.DataFrame(
@@ -356,7 +346,6 @@ def test_groupby_rolling_subset_with_closed(self):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_subset_rolling_subset_with_closed(self):
         # GH 35549
         df = pd.DataFrame(
@@ -384,7 +373,6 @@ def test_groupby_subset_rolling_subset_with_closed(self):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     @pytest.mark.parametrize("func", ["max", "min"])
     def test_groupby_rolling_index_changed(self, func):
         # GH: #36018 nlevels of MultiIndex changed
@@ -417,3 +405,47 @@ def test_groupby_rolling_empty_frame(self):
         result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
         expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None])
         tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_string_index(self):
+        # GH: 36727
+        df = pd.DataFrame(
+            [
+                ["A", "group_1", pd.Timestamp(2019, 1, 1, 9)],
+                ["B", "group_1", pd.Timestamp(2019, 1, 2, 9)],
+                ["Z", "group_2", pd.Timestamp(2019, 1, 3, 9)],
+                ["H", "group_1", pd.Timestamp(2019, 1, 6, 9)],
+                ["E", "group_2", pd.Timestamp(2019, 1, 20, 9)],
+            ],
+            columns=["index", "group", "eventTime"],
+        ).set_index("index")
+
+        groups = df.groupby("group")
+        df["count_to_date"] = groups.cumcount()
+        rolling_groups = groups.rolling("10d", on="eventTime")
+        result = rolling_groups.apply(lambda df: df.shape[0])
+        expected = pd.DataFrame(
+            [
+                ["A", "group_1", pd.Timestamp(2019, 1, 1, 9), 1.0],
+                ["B", "group_1", pd.Timestamp(2019, 1, 2, 9), 2.0],
+                ["H", "group_1", pd.Timestamp(2019, 1, 6, 9), 3.0],
+                ["Z", "group_2", pd.Timestamp(2019, 1, 3, 9), 1.0],
+                ["E", "group_2", pd.Timestamp(2019, 1, 20, 9), 1.0],
+            ],
+            columns=["index", "group", "eventTime", "count_to_date"],
+        ).set_index(["group", "index"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_no_sort(self):
+        # GH 36889
+        result = (
+            pd.DataFrame({"foo": [2, 1], "bar": [2, 1]})
+            .groupby("foo", sort=False)
+            .rolling(1)
+            .min()
+        )
+        expected = pd.DataFrame(
+            np.array([[2.0, 2.0], [1.0, 1.0]]),
+            columns=["foo", "bar"],
+            index=pd.MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]),
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index bea239a245a4f..8d72e2cb92ca9 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -7,7 +7,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
-from pandas import DataFrame, Series, compat, date_range
+from pandas import DataFrame, Series, date_range
 import pandas._testing as tm
 from pandas.core.window import Rolling
 
@@ -150,7 +150,6 @@ def test_closed_one_entry(func):
 
 
 @pytest.mark.parametrize("func", ["min", "max"])
-@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
 def test_closed_one_entry_groupby(func):
     # GH24718
     ser = pd.DataFrame(
@@ -683,7 +682,6 @@ def test_iter_rolling_datetime(expected, expected_index, window):
         ),
     ],
 )
-@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
 def test_rolling_positional_argument(grouping, _index, raw):
     # GH 34605
 
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
index 90f919d5565b0..8aa4d7103e48a 100644
--- a/pandas/tests/window/test_timeseries_window.py
+++ b/pandas/tests/window/test_timeseries_window.py
@@ -7,7 +7,6 @@
     MultiIndex,
     Series,
     Timestamp,
-    compat,
     date_range,
     to_datetime,
 )
@@ -657,7 +656,6 @@ def agg_by_day(x):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_groupby_monotonic(self):
 
         # GH 15130
@@ -687,7 +685,6 @@ def test_groupby_monotonic(self):
 
         result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(not compat.IS64, reason="GH-35294")
     def test_non_monotonic(self):
         # GH 13966 (similar to #15130, closed by #15175)
diff --git a/pyproject.toml b/pyproject.toml
index 098a38958b5cc..5b3c3fd598b2f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,10 +7,11 @@ requires = [
     "Cython>=0.29.21,<3",  # Note: sync with setup.py
     "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'",
     "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'",
-    "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'",
+    "numpy==1.17.3; python_version=='3.8' and platform_system!='AIX'",
     "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'",
     "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'",
-    "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'",
+    "numpy==1.17.3; python_version=='3.8' and platform_system=='AIX'",
+    "numpy; python_version>='3.9'",
 ]
 
 [tool.black]
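
The build-requires change replaces the open-ended `>='3.8'` markers with an exact `=='3.8'` pin and lets Python 3.9 resolve to the first numpy release that supports it. To check which requirement a given interpreter would select, the PEP 508 markers can be evaluated directly with the third-party `packaging` library (a sketch, assuming `packaging` is available):

    from packaging.markers import Marker

    marker = Marker("python_version == '3.8' and platform_system != 'AIX'")
    print(marker.evaluate())  # True only on non-AIX Python 3.8
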