diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db1fc30111a2d..149acef72db26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,9 @@ on: push: branches: master pull_request: - branches: master + branches: + - master + - 1.1.x env: ENV_FILE: environment.yml diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dc6f45f810f3d..e0a2257b0ca1f 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -6,7 +6,7 @@ from .pandas_vb_common import tm try: - from pandas.tseries.offsets import Nano, Hour + from pandas.tseries.offsets import Hour, Nano except ImportError: # For compatibility with older versions from pandas.core.datetools import * # noqa diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index e266d871f5bc6..5d9070de92ec7 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -7,14 +7,14 @@ try: from pandas import ( - rolling_median, + rolling_kurt, + rolling_max, rolling_mean, + rolling_median, rolling_min, - rolling_max, - rolling_var, rolling_skew, - rolling_kurt, rolling_std, + rolling_var, ) have_rolling_methods = True diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index ec3eddfff7184..5390056ba36f2 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,8 +2,8 @@ try: from pandas._libs.tslibs.parsing import ( - concat_date_cols, _does_string_look_like_datetime, + concat_date_cols, ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..2023858181baa 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,6 +7,21 @@ from .pandas_vb_common import tm +class Construction: + + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + self.data = tm.rands_array(nchars=10 ** 5, size=10) + + def time_construction(self, dtype): + Series(self.data, dtype=dtype) + + def peakmem_construction(self, dtype): + Series(self.data, dtype=dtype) + + class Methods: def setup(self): self.s = Series(tm.makeStringIndex(10 ** 5)) diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 7d4e0556f4d96..9a206410d8775 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -1,5 +1,5 @@ try: - from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized + from pandas._libs.tslibs import is_date_array_normalized, normalize_i8_timestamps except ImportError: from pandas._libs.tslibs.conversion import ( normalize_i8_timestamps, diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e45cafc02cb61..113ad3e338952 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,9 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master +- 1.1.x pr: - master +- 1.1.x variables: PYTEST_WORKERS: auto diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..69ce0f1adce22 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 2dc53f8181ac4..15704cf0d5427 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -23,4 +23,4 @@ dependencies: - pip - pip: - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index d31015fde4741..3034ed3dc43af 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 23121b985492e..c086b3651afc3 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml index 0a6d1d13c8549..87bad59fa4873 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-36-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 4dbb6a5344976..6f64c81f299d1 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 451fb5884a4af..5cb58756a6ac1 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 81a27465f9e61..eeea249a19ca1 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.6.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 4d7e1d821037b..21b4e86918f3b 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -7,8 +7,8 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 34fca631df6c1..287d6877b9810 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -7,8 +7,8 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 5f5ea8034cddf..2457c04e67759 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -7,10 +7,10 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build + - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - beautifulsoup4 diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 6bc4aba733ee5..03a1e751b6a86 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index f434a03609b26..5cb53489be225 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.13 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index aaf706d61fe5c..e896233aac63c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index ac39a223cd086..b879c0f81dab2 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..1b0e36e7b6933 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -751,7 +751,7 @@ Imports are alphabetically sorted within these sections. As part of :ref:`Continuous Integration ` checks we run:: - isort --recursive --check-only pandas + isort --check-only pandas to check that imports are correctly formatted as per the `setup.cfg`. @@ -770,8 +770,6 @@ You should run:: to automatically format imports correctly. This will modify your local copy of the files. -The `--recursive` flag can be passed to sort all files in a directory. - Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: git diff upstream/master --name-only -- "*.py" | xargs -r isort diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4be9d802d697..cc42f952b1733 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3441,10 +3441,11 @@ for some advanced strategies .. warning:: - pandas requires ``PyTables`` >= 3.0.0. - There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index. - If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. - Stores created previously will need to be rewritten using the updated version. + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle. Loading pickled data received from + untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ad5bb5a5b2d72..8ce10136dd2bb 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.1 v1.1.0 Version 1.0 diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst new file mode 100644 index 0000000000000..721f07c865409 --- /dev/null +++ b/doc/source/whatsnew/v1.1.1.rst @@ -0,0 +1,56 @@ +.. _whatsnew_111: + +What's new in 1.1.1 (August 20, 2020) +------------------------------------- + +These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) +- Fixed regression in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) +- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) +- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) +- Fixed regression in ``.groupby(..).rolling(..)`` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`) +- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) +- Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- Bug in :class:`~pandas.io.formats.style.Styler` whereby ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) +- Bug in :meth:`to_timedelta` fails when ``arg`` is a :class:`Series` with ``Int64`` dtype containing null values (:issue:`35574`) +- Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when ``data`` and ``index`` have mismatched lengths (:issue:`33437`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.0..v1.1.1|HEAD diff --git a/environment.yml b/environment.yml index 53222624619de..9efb995e29497 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort=4.3.21 # check that imports are in the right order + - isort>=5.2.1 # check that imports are in the right order - mypy=0.730 - pycodestyle # used by flake8 @@ -52,7 +52,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f5e16cddeb04c..d7b73a0a685d3 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -442,8 +442,8 @@ def register_option( ValueError if `validator` is specified and `defval` is not a valid value. """ - import tokenize import keyword + import tokenize key = key.lower() @@ -660,8 +660,8 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ - from textwrap import wrap from itertools import groupby + from textwrap import wrap def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6b6ead795584f..0a70afda893cf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1,11 +1,12 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport malloc, free -from libc.string cimport memmove from libc.math cimport fabs, sqrt +from libc.stdlib cimport free, malloc +from libc.string cimport memmove import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_FLOAT32, @@ -31,12 +32,11 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.util cimport numeric, get_nat - from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -46,7 +46,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) - +from pandas._libs.util cimport get_nat, numeric import pandas._libs.missing as missing @@ -1200,14 +1200,15 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - diff_t[:, :] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.is_f_contig() + bint f_contig = arr.flags.f_contiguous + # bint f_contig = arr.is_f_contig() # TODO(cython 3) # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7c57e6ee9dbfd..38cb973d6dde9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,27 +1,51 @@ import cython from cython import Py_ssize_t -from cython cimport floating -from libc.stdlib cimport malloc, free +from cython cimport floating +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) +from numpy cimport ( + complex64_t, + complex128_t, + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from numpy.math cimport NAN -cnp.import_array() -from pandas._libs.util cimport numeric, get_nat +cnp.import_array() -from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, - TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, - TIEBREAK_DENSE) -from pandas._libs.algos import (take_2d_axis1_float64_float64, - groupsort_indexer, tiebreakers) +from pandas._libs.algos cimport ( + TIEBREAK_AVERAGE, + TIEBREAK_DENSE, + TIEBREAK_FIRST, + TIEBREAK_MAX, + TIEBREAK_MIN, + TiebreakEnumType, + swap, +) +from pandas._libs.util cimport get_nat, numeric + +from pandas._libs.algos import ( + groupsort_indexer, + take_2d_axis1_float64_float64, + tiebreakers, +) from pandas._libs.missing cimport checknull + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a98820ca57895..f2af04d91a3e3 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -2,10 +2,13 @@ # at https://github.com/veorq/SipHash import cython -from libc.stdlib cimport malloc, free + +from libc.stdlib cimport free, malloc import numpy as np -from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array + +from numpy cimport import_array, ndarray, uint8_t, uint32_t, uint64_t + import_array() from pandas._libs.util cimport is_nan diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c3dcbb942d7fe..ffaf6d6505955 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,60 +1,57 @@ cimport cython - -from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Free - -from libc.stdlib cimport malloc, free +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.ref cimport Py_INCREF, PyObject +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t, float64_t +from numpy cimport float64_t, ndarray, uint8_t, uint32_t from numpy.math cimport NAN + cnp.import_array() +from pandas._libs cimport util from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, - kh_init_str, - kh_put_str, - kh_exist_str, - kh_get_str, - kh_destroy_str, - kh_resize_str, - kh_put_strbox, - kh_get_strbox, - kh_init_strbox, - kh_int64_t, - kh_init_int64, - kh_resize_int64, + kh_destroy_float64, kh_destroy_int64, - kh_get_int64, + kh_destroy_pymap, + kh_destroy_str, + kh_destroy_uint64, + kh_exist_float64, kh_exist_int64, - kh_put_int64, + kh_exist_pymap, + kh_exist_str, + kh_exist_uint64, kh_float64_t, - kh_exist_float64, - kh_put_float64, - kh_init_float64, kh_get_float64, - kh_destroy_float64, - kh_resize_float64, - kh_resize_uint64, - kh_exist_uint64, - kh_destroy_uint64, - kh_put_uint64, + kh_get_int64, + kh_get_pymap, + kh_get_str, + kh_get_strbox, kh_get_uint64, - kh_init_uint64, - kh_destroy_pymap, - kh_exist_pymap, + kh_init_float64, + kh_init_int64, kh_init_pymap, - kh_get_pymap, + kh_init_str, + kh_init_strbox, + kh_init_uint64, + kh_int64_t, + kh_put_float64, + kh_put_int64, kh_put_pymap, + kh_put_str, + kh_put_strbox, + kh_put_uint64, + kh_resize_float64, + kh_resize_int64, kh_resize_pymap, + kh_resize_str, + kh_resize_uint64, + kh_str_t, + khiter_t, ) - - -from pandas._libs cimport util - from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 35c4b73b47695..d6659cc1895b1 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,6 +1,7 @@ import warnings import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,17 +17,16 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - +from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timedeltas cimport _Timedelta - -from pandas._libs.hashtable cimport HashTable +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import algos, hashtable as _hash from pandas._libs.missing import checknull diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b4b490f49b12..4f27fde52414a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -5,12 +5,15 @@ from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx + cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np + cimport numpy as cnp from numpy cimport NPY_INT64, int64_t + cnp.import_array() from pandas._libs.algos import ensure_int64 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 95881ebf1385c..6867e8aba7411 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,7 +1,8 @@ import numbers from operator import le, lt -from cpython.datetime cimport PyDelta_Check, PyDateTime_IMPORT +from cpython.datetime cimport PyDateTime_IMPORT, PyDelta_Check + PyDateTime_IMPORT from cpython.object cimport ( @@ -16,8 +17,8 @@ from cpython.object cimport ( import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_QUICKSORT, @@ -30,22 +31,21 @@ from numpy cimport ( ndarray, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - from pandas._libs.hashtable cimport Int64Vector +from pandas._libs.tslibs.timedeltas cimport _Timedelta +from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( - is_integer_object, is_float_object, + is_integer_object, is_timedelta64_object, ) -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timedeltas cimport _Timedelta - _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 54892a7e4bc77..13c7187923473 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,6 +16,7 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs.algos import ( @@ -640,7 +641,11 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # ---------------------------------------------------------------------- from pandas._libs.hashtable cimport ( - HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + HashTable, + Int64HashTable, + PyObjectHashTable, + UInt64HashTable, +) ctypedef fused asof_t: uint8_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5ecbb2c3ffd35..eadfcefaac73d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -5,23 +5,24 @@ import warnings import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_RichCompareBool, Py_EQ -from cpython.ref cimport Py_INCREF -from cpython.tuple cimport PyTuple_SET_ITEM, PyTuple_New -from cpython.iterator cimport PyIter_Check -from cpython.sequence cimport PySequence_Check -from cpython.number cimport PyNumber_Check - from cpython.datetime cimport ( - PyDateTime_Check, PyDate_Check, - PyTime_Check, - PyDelta_Check, + PyDateTime_Check, PyDateTime_IMPORT, + PyDelta_Check, + PyTime_Check, ) +from cpython.iterator cimport PyIter_Check +from cpython.number cimport PyNumber_Check +from cpython.object cimport Py_EQ, PyObject_RichCompareBool +from cpython.ref cimport Py_INCREF +from cpython.sequence cimport PySequence_Check +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM + PyDateTime_IMPORT import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_OBJECT, @@ -39,6 +40,7 @@ from numpy cimport ( uint8_t, uint64_t, ) + cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -63,28 +65,23 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 from pandas._libs cimport util -from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime -from pandas._libs.tslibs.nattype cimport ( - NPY_NAT, - c_NaT as NaT, - checknull_with_nat, -) -from pandas._libs.tslibs.conversion cimport convert_to_tsobject -from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.missing cimport ( + C_NA, checknull, - isnaobj, is_null_datetime64, is_null_timedelta64, - C_NA, + isnaobj, ) - +from pandas._libs.tslibs.conversion cimport convert_to_tsobject +from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT, checknull_with_nat +from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.period cimport is_period_object +from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 +from pandas._libs.tslibs.timezones cimport tz_compare # constants that will be compared to potentially arbitrarily large # python int @@ -621,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: - """ - Convert all elements in an array to string. +cpdef ndarray[object] ensure_string_array( + arr, + object na_value=np.nan, + bint convert_na_value=True, + bint copy=True, + bint skipna=True, +): + """Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- - arr : ndarray - The array whose elements we are casting. - skipna : bool, default False + arr : array-like + The values to be converted to str, if needed. + na_value : Any + The value to use for na. For example, np.nan or pd.NA. + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array. + copy : bool, default True + Whether to ensure that a new array is returned. + skipna : bool, default True Whether or not to coerce nulls to their stringified form - (e.g. NaN becomes 'nan'). + (e.g. if False, NaN becomes 'nan'). Returns ------- ndarray - A new array with the input array's elements casted. + An array with the input array's elements casted to str or nan-like. """ cdef: - object arr_i - Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) - - for i in range(n): - arr_i = arr[i] + Py_ssize_t i = 0, n = len(arr) - if not (skipna and checknull(arr_i)): - arr_i = str(arr_i) + result = np.asarray(arr, dtype="object") + if copy and result is arr: + result = result.copy() - result[i] = arr_i + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = str(val) return result @@ -1317,8 +1331,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if not isinstance(value, list): value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) # make contiguous diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index fdd06fe631b97..771e8053ac9be 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,27 +1,25 @@ -import cython -from cython import Py_ssize_t - import numbers +import cython +from cython import Py_ssize_t import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t + cnp.import_array() from pandas._libs cimport util - - -from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, is_null_datetimelike, ) -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit - cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -157,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = checknull(val) or val == INF or val == NEGINF + result[i] = ( + checknull(val) + or util.is_float_object(val) and (val == INF or val == NEGINF) + ) return result.view(np.bool_) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 658600cdfbe6c..d1f897d237c1b 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,18 +10,17 @@ from cpython.object cimport ( PyObject_RichCompareBool, ) - import cython from cython import Py_ssize_t - import numpy as np -from numpy cimport ndarray, uint8_t, import_array -import_array() +from numpy cimport import_array, ndarray, uint8_t + +import_array() -from pandas._libs.util cimport UINT8_MAX, is_nan from pandas._libs.missing cimport checknull +from pandas._libs.util cimport UINT8_MAX, is_nan @cython.wraparound(False) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6ffb036e01595..fa77af6bd5a25 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,8 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license import bz2 +from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC +from errno import ENOENT import gzip import io import os @@ -9,17 +11,14 @@ import time import warnings import zipfile -from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE -from errno import ENOENT - from libc.stdlib cimport free -from libc.string cimport strncpy, strlen, strcasecmp +from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode @@ -30,37 +29,59 @@ cdef extern from "Python.h": import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t + cnp.import_array() from pandas._libs cimport util -from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX + import pandas._libs.lib as lib from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, - kh_float64_t, kh_get_float64, kh_destroy_float64, - kh_put_float64, kh_init_float64, kh_resize_float64, - kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, + kh_destroy_float64, + kh_destroy_str, + kh_destroy_str_starts, kh_destroy_strbox, - kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, - kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) + kh_exist_str, + kh_float64_t, + kh_get_float64, + kh_get_str, + kh_get_str_starts_item, + kh_get_strbox, + kh_init_float64, + kh_init_str, + kh_init_str_starts, + kh_init_strbox, + kh_put_float64, + kh_put_str, + kh_put_str_starts_item, + kh_put_strbox, + kh_resize_float64, + kh_resize_str_starts, + kh_str_starts_t, + kh_str_t, + kh_strbox_t, + khiter_t, +) + +from pandas.compat import _get_lzma_file, _import_lzma +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype) + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.concat import union_categoricals -from pandas.compat import _import_lzma, _get_lzma_file -from pandas.errors import (ParserError, DtypeWarning, - EmptyDataError, ParserWarning) - lzma = _import_lzma() cdef: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index a01e0c5705dcf..7b36bc8baf891 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -2,15 +2,18 @@ from copy import copy from cython import Py_ssize_t -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport int64_t, ndarray + cnp.import_array() from pandas._libs cimport util -from pandas._libs.lib import maybe_convert_objects, is_scalar + +from pandas._libs.lib import is_scalar, maybe_convert_objects cdef _check_result_array(object obj, Py_ssize_t cnt): diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index da4dd00027395..5c6c15fb50fed 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -16,7 +16,9 @@ from numpy cimport ( ) import numpy as np + cimport numpy as cnp + cnp.import_array() from pandas._libs.lib cimport c_is_list_like diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 7c9575d921dc9..321d7c374d8ec 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -1,9 +1,18 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, - float64_t, float32_t) +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, +) + cnp.import_array() diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 785a4d1f8b923..64fc8d615ea9c 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,13 +1,16 @@ import math import numpy as np + from numpy cimport import_array + import_array() from pandas._libs.util cimport is_array -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.missing import array_equivalent, isna + cdef NUMERIC_TYPES = ( bool, @@ -129,6 +132,7 @@ cpdef assert_almost_equal(a, b, if not isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -181,6 +185,7 @@ cpdef assert_almost_equal(a, b, elif isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 35d5cd8f1e275..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,23 +7,20 @@ from cpython.datetime cimport ( datetime, tzinfo, ) + # import datetime C API PyDateTime_IMPORT cimport numpy as cnp from numpy cimport float64_t, int64_t, ndarray + import numpy as np + cnp.import_array() import pytz -from pandas._libs.util cimport ( - is_datetime64_object, - is_float_object, - is_integer_object, -) - from pandas._libs.tslibs.np_datetime cimport ( _string_to_dts, check_dts_bounds, @@ -34,9 +31,9 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.util cimport is_datetime64_object, is_float_object, is_integer_object from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( @@ -45,22 +42,18 @@ from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, get_datetime64_nanos, ) - from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) - from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.tzconversion cimport ( - tz_localize_to_utc_single, -) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single def _test_parse_iso8601(ts: str): diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 00cecd25e5225..6cce2f5e1fd95 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,7 @@ Cython implementations of functions resembling the stdlib calendar module import cython -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 8cc3d25e86340..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,44 +1,68 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int32_t, intp_t, ndarray +from numpy cimport int32_t, int64_t, intp_t, ndarray + cnp.import_array() import pytz # stdlib datetime imports -from cpython.datetime cimport (datetime, time, tzinfo, - PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT) + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + datetime, + time, + tzinfo, +) + PyDateTime_IMPORT from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, - _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64, - get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64, - NPY_DATETIMEUNIT, NPY_FR_ns) -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + NPY_DATETIMEUNIT, + NPY_FR_ns, + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_unit, + get_datetime64_value, + npy_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, +) -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_integer_object, is_float_object) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - maybe_get_tz, tz_compare, + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, + maybe_get_tz, + tz_compare, utc_pytz as UTC, ) +from pandas._libs.tslibs.util cimport ( + is_datetime64_object, + is_float_object, + is_integer_object, +) + from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - checknull_with_nat, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, ) - from pandas._libs.tslibs.tzconversion cimport ( tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 1d1f900bc18b3..16fa05c3801c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -6,26 +6,37 @@ from locale import LC_TIME import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t +from numpy cimport int8_t, int32_t, int64_t, ndarray, uint32_t + cnp.import_array() from pandas._config.localization import set_locale -from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL +from pandas._libs.tslibs.ccalendar import DAYS_FULL, MONTHS_FULL + from pandas._libs.tslibs.ccalendar cimport ( - get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t, - month_offset, + dayofweek, + get_day_of_year, + get_days_in_month, get_firstbday, + get_iso_calendar, get_lastbday, + get_week_of_year, + is_leapyear, + iso_calendar_t, + month_offset, ) -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, - td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.np_datetime cimport ( + dt64_to_dtstruct, + npy_datetimestruct, + pandas_timedeltastruct, + td64_to_tdstruct, +) + from pandas._libs.tslibs.strptime import LocaleTime diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 264013f928d22..73df51832d700 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,3 +1,10 @@ +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.object cimport ( Py_EQ, Py_GE, @@ -8,28 +15,19 @@ from cpython.object cimport ( PyObject_RichCompare, ) -from cpython.datetime cimport ( - PyDateTime_Check, - PyDateTime_IMPORT, - PyDelta_Check, - datetime, - timedelta, -) PyDateTime_IMPORT from cpython.version cimport PY_MINOR_VERSION import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) cimport pandas._libs.tslibs.util as util - +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 31cc55ad981bb..12aaaf4ce3977 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,5 +1,3 @@ -from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE - from cpython.datetime cimport ( PyDateTime_DATE_GET_HOUR, PyDateTime_DATE_GET_MICROSECOND, @@ -10,11 +8,15 @@ from cpython.datetime cimport ( PyDateTime_GET_YEAR, PyDateTime_IMPORT, ) +from cpython.object cimport Py_EQ, Py_GE, Py_GT, Py_LE, Py_LT, Py_NE + PyDateTime_IMPORT from numpy cimport int64_t + from pandas._libs.tslibs.util cimport get_c_string_buf_and_size + cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, npy_datetimestruct *b) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9a7ca15a2a1c2..ac2725fc58aee 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,39 +1,51 @@ -import cython - import operator import re import time from typing import Any import warnings -from cpython.datetime cimport (PyDateTime_IMPORT, - PyDateTime_Check, - PyDate_Check, - PyDelta_Check, - datetime, timedelta, date, - time as dt_time) + +import cython + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + date, + datetime, + time as dt_time, + timedelta, +) + PyDateTime_IMPORT -from dateutil.relativedelta import relativedelta from dateutil.easter import easter - +from dateutil.relativedelta import relativedelta import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_integer_object, is_datetime64_object, is_float_object, + is_integer_object, ) from pandas._libs.tslibs.ccalendar import ( - MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, + MONTH_ALIASES, + MONTH_TO_CAL_NUM, + int_to_weekday, + weekday_to_int, ) + from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, dayofweek, @@ -47,17 +59,20 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, pydate_to_dtstruct, ) from pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds + from .timedeltas import Timedelta + from .timestamps cimport _Timestamp + from .timestamps import Timestamp # --------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c4f369d0d3b3f..8429aebbd85b8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -9,39 +9,44 @@ from libc.string cimport strchr import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_Str - from cpython.datetime cimport datetime, datetime_new, import_datetime, tzinfo +from cpython.object cimport PyObject_Str from cpython.version cimport PY_VERSION_HEX + import_datetime() import numpy as np + cimport numpy as cnp -from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, - PyArray_IterNew, flatiter, float64_t) +from numpy cimport ( + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, + flatiter, + float64_t, +) + cnp.import_array() # dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) + +from dateutil.parser import DEFAULTPARSER, parse as du_parse from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER -from dateutil.parser import parse as du_parse +from dateutil.tz import ( + tzlocal as _dateutil_tzlocal, + tzoffset, + tzstr as _dateutil_tzstr, + tzutc as _dateutil_tzutc, +) from pandas._config import get_option from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.nattype cimport ( - c_nat_strings as nat_strings, - c_NaT as NaT, -) -from pandas._libs.tslibs.util cimport ( - is_array, - get_c_string_buf_and_size, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT, c_nat_strings as nat_strings from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size, is_array + cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 20961c6da56bd..86b6533f5caf5 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,96 +1,98 @@ import warnings -from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompareBool +from numpy cimport import_array, int64_t, ndarray -from numpy cimport int64_t, import_array, ndarray import numpy as np + import_array() from libc.stdlib cimport free, malloc +from libc.string cimport memset, strlen from libc.time cimport strftime, tm -from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( - datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, + datetime, ) + # import datetime C API PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, - dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, - check_dts_bounds, NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) + cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timedeltas import Timedelta -from pandas._libs.tslibs.timedeltas cimport ( - delta_to_nanoseconds, - is_any_td_scalar, -) +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.ccalendar cimport ( + c_MONTH_NUMBERS, dayofweek, get_day_of_year, - is_leapyear, - get_week_of_year, get_days_in_month, + get_week_of_year, + is_leapyear, ) -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.conversion import ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( - PeriodDtypeBase, - FR_UND, FR_ANN, - FR_QTR, - FR_MTH, - FR_WK, FR_BUS, FR_DAY, FR_HR, FR_MIN, - FR_SEC, FR_MS, - FR_US, + FR_MTH, FR_NS, + FR_QTR, + FR_SEC, + FR_UND, + FR_US, + FR_WK, + PeriodDtypeBase, attrname_to_abbrevs, ) - from pandas._libs.tslibs.parsing cimport get_rule_month + from pandas._libs.tslibs.parsing import parse_time_string + from pandas._libs.tslibs.nattype cimport ( - _nat_scalar_rules, NPY_NAT, - is_null_datetimelike, + _nat_scalar_rules, c_NaT as NaT, c_nat_strings as nat_strings, + is_null_datetimelike, ) from pandas._libs.tslibs.offsets cimport ( BaseOffset, - to_offset, - is_tick_object, is_offset_object, + is_tick_object, + to_offset, ) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG cdef: enum: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 660b582f73e6e..d2690be905a68 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,27 +1,30 @@ """Strptime-related classes and functions. """ -import time -import locale import calendar +import locale import re +import time from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock +import numpy as np import pytz -import numpy as np from numpy cimport int64_t -from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_nat_strings as nat_strings, + checknull_with_nat, ) +from pandas._libs.tslibs.np_datetime cimport ( + check_dts_bounds, + dtstruct_to_dt64, + npy_datetimestruct, +) + cdef dict _parse_code_table = {'y': 0, 'Y': 1, diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f3a599bf107c..ee32ed53a908b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2,39 +2,47 @@ import collections import cython -from cpython.object cimport Py_NE, Py_EQ, PyObject_RichCompare +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() -from cpython.datetime cimport (timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + timedelta, +) + PyDateTime_IMPORT cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - is_timedelta64_object, is_datetime64_object, is_integer_object, - is_float_object, is_array -) - from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.conversion cimport cast_from_unit - -from pandas._libs.tslibs.np_datetime cimport ( - cmp_scalar, td64_to_tdstruct, pandas_timedeltastruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, +) +from pandas._libs.tslibs.np_datetime cimport ( + cmp_scalar, + pandas_timedeltastruct, + td64_to_tdstruct, ) from pandas._libs.tslibs.offsets cimport is_tick_object +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8cef685933863..bddfc30d86a53 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -9,54 +9,66 @@ shadows the python class, where we do any heavy lifting. import warnings import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int8_t, uint8_t, ndarray -cnp.import_array() +from numpy cimport int8_t, int64_t, ndarray, uint8_t -from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_EQ, Py_NE) +cnp.import_array() -from cpython.datetime cimport ( - datetime, - time, - tzinfo, - tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below +from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below PyDateTime_Check, + PyDateTime_IMPORT, PyDelta_Check, PyTZInfo_Check, - PyDateTime_IMPORT, -) -PyDateTime_IMPORT - -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, is_array, + datetime, + time, + tzinfo as tzinfo_type, ) +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare, PyObject_RichCompareBool -from pandas._libs.tslibs.base cimport ABCTimestamp +PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar - +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, - convert_to_tsobject, convert_datetime_to_tsobject, + convert_to_tsobject, normalize_i8_stamp, ) -from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) + +from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field + from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, + check_dts_bounds, cmp_scalar, + dt64_to_dtstruct, + npy_datetimestruct, pydatetime_to_dt64, ) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object -from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, delta_to_nanoseconds + +from pandas._libs.tslibs.offsets cimport is_offset_object, to_offset +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.timezones cimport ( - is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, - get_timezone, tz_compare, + get_timezone, + is_utc, + maybe_get_tz, + treat_tz_as_pytz, + tz_compare, + utc_pytz as UTC, ) from pandas._libs.tslibs.tzconversion cimport ( tz_convert_from_utc_single, diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index a8c785704d8e8..b82291a71057e 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,27 +1,31 @@ from datetime import timezone + from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat + from dateutil.tz import ( gettz as dateutil_gettz, tzfile as _dateutil_tzfile, tzlocal as _dateutil_tzlocal, tzutc as _dateutil_tzutc, ) - - -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo import pytz +from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + UTC = pytz.utc import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport is_integer_object, get_nat +from pandas._libs.tslibs.util cimport get_nat, is_integer_object + cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 606639af16a18..2b148cd8849f1 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,21 +5,27 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, + tzinfo, +) + PyDateTime_IMPORT -import pytz from dateutil.tz import tzutc - import numpy as np +import pytz + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, intp_t +from numpy cimport int64_t, intp_t, ndarray, uint8_t + cnp.import_array() from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) +from pandas._libs.tslibs.np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from pandas._libs.tslibs.timezones cimport ( get_dst_info, get_utcoffset, diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c8f8daf6724c2..bdc00f6c6e21a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,18 +1,21 @@ import cython -from cpython.datetime cimport datetime, date, time, tzinfo +from cpython.datetime cimport date, datetime, time, tzinfo import numpy as np + from numpy cimport int64_t, intp_t, ndarray from .conversion cimport normalize_i8_stamp + from .dtypes import Resolution + from .nattype cimport NPY_NAT, c_NaT as NaT -from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from .offsets cimport to_offset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .timezones cimport get_dst_info, is_tzlocal, is_utc from .tzconversion cimport tz_convert_utc_to_tzlocal # ------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 362d0e6263697..3ec4547d223ce 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,13 +2,15 @@ import cython from cython import Py_ssize_t -from libcpp.deque cimport deque -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc +from libcpp.deque cimport deque import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t + cnp.import_array() @@ -22,6 +24,7 @@ from pandas._libs.algos import is_monotonic from pandas._libs.util cimport numeric + cdef extern from "../src/skiplist.h": ctypedef struct node_t: node_t **next diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 8a1e7feb57ace..9af1159a805ec 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,7 +1,8 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np -from numpy cimport ndarray, int64_t + +from numpy cimport int64_t, ndarray # Cython routines for window indexers diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 2d5b31d7ccbcf..40c39aabb7a7a 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -5,8 +5,8 @@ from cpython.bytes cimport PyBytes_GET_SIZE from cpython.unicode cimport PyUnicode_GET_SIZE import numpy as np -from numpy cimport ndarray, uint8_t +from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: str diff --git a/pandas/_testing.py b/pandas/_testing.py index fc6df7a95e348..ef6232fa6d575 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -9,7 +9,7 @@ from shutil import rmtree import string import tempfile -from typing import Any, Callable, List, Optional, Type, Union, cast +from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -535,7 +535,7 @@ def rands(nchars): def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close + from matplotlib.pyplot import close as _close, get_fignums if fignum is None: for fignum in get_fignums(): @@ -1339,10 +1339,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, @@ -1379,12 +1377,18 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( @@ -2880,9 +2884,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): return expected -def external_error_raised( - expected_exception: Type[Exception], -) -> Callable[[Type[Exception], None], None]: +def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..76ec527e6e258 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -24,13 +24,15 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 - from pandas.core.arrays.base import ExtensionArray # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 # array-like diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 0484de3fa165d..015b203a60256 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -14,7 +14,7 @@ from pandas import Index if TYPE_CHECKING: - from pandas import Series, DataFrame + from pandas import DataFrame, Series def load_reduce(self): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 733dbeed34b72..6d44cf917a07a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -15,7 +15,7 @@ from pandas.core.construction import create_series_with_explicit_dtype if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import DataFrame, Index, Series ResType = Dict[int, Any] @@ -389,6 +389,8 @@ def series_generator(self): blk = mgr.blocks[0] for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr blk.values = arr ser.name = name yield ser diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index db9cfd9d7fc59..6e5c7bc699962 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -520,7 +520,7 @@ def _from_inferred_categories( ------- Categorical """ - from pandas import Index, to_numeric, to_datetime, to_timedelta + from pandas import Index, to_datetime, to_numeric, to_timedelta cats = Index(inferred_categories) known_categories = ( @@ -1403,7 +1403,7 @@ def value_counts(self, dropna=True): -------- Series.value_counts """ - from pandas import Series, CategoricalIndex + from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories ncat, mask = len(cat), 0 <= code diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ee4d43fdb3bc2..c6945e2f78b5a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -959,7 +959,7 @@ def value_counts(self, dropna=False): ------- Series """ - from pandas import Series, Index + from pandas import Index, Series if dropna: values = self[~self.isna()]._data diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0958af41158c..57df067c7b16e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -116,6 +116,7 @@ def __from_arrow__( Construct IntegerArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c861d25afd13f..ed2437cc061bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1105,6 +1105,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType try: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8d5cb12d60e4d..fe78481d99d30 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -300,6 +300,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8a30d2b954b55..da8d695c59b9e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -87,8 +87,8 @@ def from_coo(cls, A, dense_index=False): 1 0 3.0 dtype: Sparse[float64, nan] """ - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series result = _coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) @@ -253,9 +253,10 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame from pandas._libs.sparse import IntIndex + from pandas import DataFrame + data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape @@ -354,8 +355,8 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase from pandas.core.indexes.api import ensure_index + import pandas.core.indexes.base as ibase N, K = data.shape if index is None: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d8db196e4b92f..1531f7b292365 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -234,7 +234,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'int', 'block'}, default 'int' + kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fddd3af858f77..a4778869aee24 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -178,11 +178,10 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): values = extract_array(values) - skip_validation = isinstance(values, type(self)) super().__init__(values, copy=copy) self._dtype = StringDtype() - if not skip_validation: + if not isinstance(values, type(self)): self._validate() def _validate(self): @@ -201,23 +200,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): assert dtype == "string" result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() - - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the values there. - na_values = isna(result) - has_nans = na_values.any() - if has_nans and result is scalars: - # force a copy now, if we haven't already - result = result.copy() - - # convert to str, then to object to avoid dtype like '>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) array(['1.0', '2.0', None], dtype=object) """ - subarr = np.array(values, dtype=dtype, copy=copy) if dtype is not None and dtype.kind == "U": - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since - # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. - na_values = isna(values) - subarr2 = subarr.astype(object) - subarr2[na_values] = np.asarray(values, dtype=object)[na_values] - subarr = subarr2 + subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 22480fbc47508..8350e136417b1 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -30,12 +30,13 @@ if TYPE_CHECKING: import pyarrow # noqa: F401 + + from pandas import Categorical # noqa: F401 from pandas.core.arrays import ( # noqa: F401 + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, ) - from pandas import Categorical # noqa: F401 str_type = str @@ -391,12 +392,13 @@ def __repr__(self) -> str_type: @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: + from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype + from pandas.core.util.hashing import ( - hash_array, _combine_hash_arrays, + hash_array, hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE if len(categories) and isinstance(categories[0], tuple): # assumes if any individual category is a tuple, then all our. ATM @@ -939,6 +941,7 @@ def __from_arrow__( Construct PeriodArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1136,6 +1139,7 @@ def __from_arrow__( Construct IntervalArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f52341ed782d8..041121d60ad33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,6 +150,7 @@ if TYPE_CHECKING: from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -1370,6 +1371,8 @@ def to_numpy( result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) return result @@ -4775,7 +4778,7 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's - if mask.all(): + if mask.size > 0 and mask.all(): dtype = index.dtype fill_value = na_value_for_dtype(dtype) values = construct_1d_arraylike_from_scalar( @@ -5204,8 +5207,9 @@ def duplicated( 4 True dtype: bool """ + from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT if self.empty: return self._constructor_sliced(dtype=bool) @@ -7867,8 +7871,8 @@ def join( def _join_compat( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ): - from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge if isinstance(other, Series): if other.name is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..8bd1dbea4696f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6799,6 +6799,9 @@ def interpolate( obj = self.T if should_transpose else self + if obj.empty: + return self.copy() + if method not in fillna_methods: axis = self._info_axis_number @@ -9397,7 +9400,7 @@ def truncate( if before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec7b14f27c5a1..f5858c5c54f1d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -322,11 +322,14 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - output = self._wrap_aggregated_output(results) + output = self._wrap_aggregated_output(results, index=None) return self.obj._constructor_expanddim(output, columns=columns) + # TODO: index should not be Optional - see GH 35490 def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -335,7 +338,7 @@ def _wrap_series_output( ---------- output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. - index : pd.Index + index : pd.Index or None Index to apply to the output. Returns @@ -363,8 +366,11 @@ def _wrap_series_output( return result + # TODO: Remove index argument, use self.grouper.result_index, see GH 35490 def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -383,9 +389,7 @@ def _wrap_aggregated_output( In the vast majority of cases output will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - result = self._wrap_series_output( - output=output, index=self.grouper.result_index - ) + result = self._wrap_series_output(output=output, index=index) return self._reindex_output(result) def _wrap_transformed_output( @@ -681,8 +685,8 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level @@ -1714,7 +1718,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1739,8 +1745,7 @@ def _wrap_aggregated_output( self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: - index = self.grouper.result_index - result.index = index + result.index = self.grouper.result_index if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac45222625569..11d0c8e42f745 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -973,7 +973,9 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index] + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): @@ -1048,7 +1050,7 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs @@ -1101,7 +1103,7 @@ def _python_agg_general( output[key] = maybe_cast_result(values[mask], result) - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -2521,7 +2523,7 @@ def _get_cythonized_result( raise TypeError(error_msg) if aggregate: - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) else: return self._wrap_transformed_output(output) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 67003dffb90bb..8239a792c65dd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -237,7 +237,6 @@ def __new__(cls, *args, **kwargs): # core/groupby/grouper.py::Grouper # raising these warnings from TimeGrouper directly would fail the test: # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base - # hacky way to set the stacklevel: if cls is TimeGrouper it means # that the call comes from a pandas internal call of resample, # otherwise it comes from pd.Grouper diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 986d6323e704e..1be381e38b157 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5731,9 +5731,9 @@ def _maybe_cast_data_without_dtype(subarr): """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, TimedeltaArray, ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b0b008de69a94..74b235655e345 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -348,12 +348,12 @@ def _format_attrs(self): return attrs def _format_with_header(self, header, na_rep="NaN") -> List[str]: - from pandas.io.formats.format import format_array + from pandas.io.formats.printing import pprint_thing - formatted_values = format_array( - self._values, formatter=None, na_rep=na_rep, justify="left" - ) - result = ibase.trim_front(formatted_values) + result = [ + pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep + for x in self._values + ] return header + result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e5e98039ff77b..eee610681087d 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional +from typing import Any, Optional import warnings import numpy as np @@ -33,8 +33,6 @@ from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.io.formats.printing import pprint_thing - _empty_range = range(0) @@ -197,9 +195,6 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep="NaN") -> List[str]: - return header + [pprint_thing(x) for x in self._range] - # -------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ca6eca1ff829..f4f4a3666a84e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -105,7 +105,6 @@ class Block(PandasObject): is_extension = False _can_hold_na = False _can_consolidate = True - _verify_integrity = True _validate_ndim = True @classmethod @@ -1525,7 +1524,6 @@ class ExtensionBlock(Block): """ _can_consolidate = False - _verify_integrity = False _validate_ndim = False is_extension = True @@ -2613,7 +2611,6 @@ def _replace_coerce( class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True - _verify_integrity = True _can_hold_na = True should_store = Block.should_store diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 895385b170c91..4c3805f812bb0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2,7 +2,17 @@ import itertools import operator import re -from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import ( + DefaultDict, + Dict, + List, + Optional, + Pattern, + Sequence, + Tuple, + TypeVar, + Union, +) import warnings import numpy as np @@ -312,7 +322,7 @@ def _verify_integrity(self) -> None: mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + if block.shape[1:] != mgr_shape[1:]: raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( @@ -551,6 +561,24 @@ def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + if axis == 0 and self.ndim == 2 and self.nblocks > 1: + # GH#35488 we need to watch out for multi-block cases + ncols = self.shape[0] + if periods > 0: + indexer = [-1] * periods + list(range(ncols - periods)) + else: + nper = abs(periods) + indexer = list(range(nper, ncols)) + [-1] * nper + result = self.reindex_indexer( + self.items, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + consolidate=False, + ) + return result + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager": @@ -849,6 +877,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" result = np.empty(self.shape, dtype=dtype) @@ -1213,6 +1243,7 @@ def reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + consolidate: bool = True, ) -> T: """ Parameters @@ -1223,7 +1254,8 @@ def reindex_indexer( fill_value : object, default None allow_dups : bool, default False copy : bool, default True - + consolidate: bool, default True + Whether to consolidate inplace before reindexing. pandas-indexer with -1's only. """ @@ -1236,7 +1268,8 @@ def reindex_indexer( result.axes[axis] = new_axis return result - self._consolidate_inplace() + if consolidate: + self._consolidate_inplace() # some axes don't allow reindexing with dups if not allow_dups: @@ -1899,7 +1932,10 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1910,7 +1946,7 @@ def _compare_or_regex_search( Parameters ---------- a : array_like - b : scalar + b : scalar or regex pattern regex : bool, default False mask : array_like or None (default) @@ -1920,7 +1956,7 @@ def _compare_or_regex_search( """ def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Scalar, + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], ): """ Raises an error if the two arrays (a,b) cannot be compared. @@ -1941,7 +1977,7 @@ def _check_comparison_types( else: op = np.vectorize( lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, str) + if isinstance(x, str) and isinstance(b, (str, Pattern)) else False ) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index fd9a9a5ef6c93..6eedf72726acb 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -5,8 +5,8 @@ from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.managers import BlockManager # noqa:F401 from pandas.core.internals.blocks import Block # noqa:F401 + from pandas.core.internals.managers import BlockManager # noqa:F401 def operate_blockwise( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 27b331babe692..2349cb1dcc0c7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1667,7 +1667,7 @@ def _get_merge_keys(self): msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lk.dtype)}" + f"with type {repr(lt.dtype)}" ) if needs_i8_conversion(lt): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..6702bf519c52e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _map_stringarray( an ndarray. """ - from pandas.arrays import IntegerArray, StringArray, BooleanArray + from pandas.arrays import BooleanArray, IntegerArray, StringArray mask = isna(arr) @@ -2186,7 +2186,7 @@ def _wrap_result( returns_string=True, ): - from pandas import Index, Series, MultiIndex + from pandas import Index, MultiIndex, Series # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -2292,7 +2292,7 @@ def _get_series_list(self, others): list of Series Others transformed into list of Series. """ - from pandas import Series, DataFrame + from pandas import DataFrame, Series # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adab143f6052..7aac2f793f61a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,9 +53,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas import Series # noqa:F401 + # --------------------------------------------------------------------- # types used in annotations @@ -876,7 +877,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): ------- Series """ - from pandas import to_timedelta, to_numeric, DataFrame + from pandas import DataFrame, to_numeric, to_timedelta arg = DataFrame(arg) if not arg.columns.is_unique: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 41548931f17f8..cff4695603d06 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -40,13 +40,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. - downcast : {'int', 'signed', 'unsigned', 'float'}, default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'int' or 'signed': smallest signed int dtype (min.: np.int8) + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1b56b6d5a46fa..d79b9f4092325 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -275,7 +275,7 @@ def hash_array( # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: - from pandas import factorize, Categorical, Index + from pandas import Categorical, Index, factorize codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 58e7841d4dde5..51a067427e867 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -52,7 +52,7 @@ def __init__(self, obj, *args, **kwargs): kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: - groupby, obj = obj, obj.obj + groupby, obj = obj, obj._selected_obj self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0898836ed2e0e..7cbe34cdebf9f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,6 +1,6 @@ """Indexer objects for computing start/end window bounds for rolling operations""" from datetime import timedelta -from typing import Dict, Optional, Tuple, Type, Union +from typing import Dict, Optional, Tuple, Type import numpy as np @@ -265,7 +265,8 @@ def __init__( index_array: Optional[np.ndarray], window_size: int, groupby_indicies: Dict, - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]], + rolling_indexer: Type[BaseIndexer], + indexer_kwargs: Optional[Dict], **kwargs, ): """ @@ -276,7 +277,10 @@ def __init__( """ self.groupby_indicies = groupby_indicies self.rolling_indexer = rolling_indexer - super().__init__(index_array, window_size, **kwargs) + self.indexer_kwargs = indexer_kwargs or {} + super().__init__( + index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs + ) @Appender(get_window_bounds_doc) def get_window_bounds( @@ -298,7 +302,9 @@ def get_window_bounds( else: index_array = self.index_array indexer = self.rolling_indexer( - index_array=index_array, window_size=self.window_size, + index_array=index_array, + window_size=self.window_size, + **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( len(indicies), min_periods, center, closed @@ -319,4 +325,10 @@ def get_window_bounds( end_arrays.append(window_indicies.take(end)) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) + # GH 35552: Need to adjust start and end based on the nans appended to values + # when center=True + if num_values > len(start): + offset = num_values - len(start) + start = np.concatenate([start, np.array([end[-1]] * offset)]) + end = np.concatenate([end, np.array([end[-1]] * offset)]) return start, end diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 445f179248226..d727881f8285a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -145,7 +145,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj, + obj: FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -2196,7 +2196,7 @@ def _apply( # Cannot use _wrap_outputs because we calculate the result all at once # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names - grouped_object_index = self._groupby._selected_obj.index + grouped_object_index = self.obj.index grouped_index_name = [grouped_object_index.name] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name @@ -2255,10 +2255,16 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: ------- GroupbyRollingIndexer """ - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] - if self.is_freq_type: + rolling_indexer: Type[BaseIndexer] + indexer_kwargs: Optional[Dict] = None + index_array = self.obj.index.asi8 + if isinstance(self.window, BaseIndexer): + rolling_indexer = type(self.window) + indexer_kwargs = self.window.__dict__ + # We'll be using the index of each group later + indexer_kwargs.pop("index_array", None) + elif self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self._groupby._selected_obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2267,6 +2273,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: window_size=window, groupby_indicies=self._groupby.indices, rolling_indexer=rolling_indexer, + indexer_kwargs=indexer_kwargs, ) return window_indexer @@ -2275,7 +2282,7 @@ def _gotitem(self, key, ndim, subset=None): # here so our index is carried thru to the selected obj # when we do the splitting for the groupby if self.on is not None: - self._groupby.obj = self._groupby.obj.set_index(self._on) + self.obj = self.obj.set_index(self._on) self.on = None return super()._gotitem(key, ndim, subset=subset) diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 40bff5a75709b..d16955a98b62f 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -311,17 +311,17 @@ def init_windows_clipboard(): global HGLOBAL, LPVOID, DWORD, LPCSTR, INT global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE from ctypes.wintypes import ( - HGLOBAL, - LPVOID, + BOOL, DWORD, - LPCSTR, - INT, - HWND, + HANDLE, + HGLOBAL, HINSTANCE, HMENU, - BOOL, + HWND, + INT, + LPCSTR, + LPVOID, UINT, - HANDLE, ) windll = ctypes.windll @@ -528,8 +528,8 @@ def determine_clipboard(): # Setup for the MAC OS X platform: if os.name == "mac" or platform.system() == "Darwin": try: - import Foundation # check if pyobjc is installed import AppKit + import Foundation # check if pyobjc is installed except ImportError: return init_osx_pbcopy_clipboard() else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2a12f779230b2..b1bbda4a4b7e0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -834,8 +834,8 @@ class ExcelFile: from pandas.io.excel._odfreader import _ODFReader from pandas.io.excel._openpyxl import _OpenpyxlReader - from pandas.io.excel._xlrd import _XlrdReader from pandas.io.excel._pyxlsb import _PyxlsbReader + from pandas.io.excel._xlrd import _XlrdReader _engines = { "xlrd": _XlrdReader, diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 85ec9afaaec25..44abaf5d3b3c9 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -191,9 +191,9 @@ def _get_cell_string_value(self, cell) -> str: Find and decode OpenDocument text:s tags that represent a run length encoded sequence of space characters. """ - from odf.element import Text, Element - from odf.text import S, P + from odf.element import Element, Text from odf.namespaces import TEXTNS + from odf.text import P, S text_p = P().qname text_s = S().qname diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..03a30cbd62f9a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -225,7 +225,7 @@ def _convert_to_fill(cls, fill_dict): ------- fill : openpyxl.styles.Fill """ - from openpyxl.styles import PatternFill, GradientFill + from openpyxl.styles import GradientFill, PatternFill _pattern_fill_key_map = { "patternType": "fill_type", diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..af82c15fd6b66 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -48,11 +48,11 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): from xlrd import ( - xldate, + XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR, - XL_CELL_BOOLEAN, XL_CELL_NUMBER, + xldate, ) epoch1904 = self.book.datemode diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index fe85eab4bfbf5..c05f79f935548 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -72,7 +72,7 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: - from pandas import Series, DataFrame, Categorical + from pandas import Categorical, DataFrame, Series FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d11144938eb26..3bbb5271bce61 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -42,8 +42,8 @@ try: - import matplotlib.pyplot as plt from matplotlib import colors + import matplotlib.pyplot as plt has_mpl = True except ImportError: @@ -390,16 +390,16 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style - if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): + props = [] + if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(tuple(x.split(":"))) + else: + props.append(("", "")) row_es.append(row_dict) - props = [] - for x in ctx[r, c]: - # have to handle empty styles like [''] - if x.count(":"): - props.append(tuple(x.split(":"))) - else: - props.append(("", "")) cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) diff --git a/pandas/io/html.py b/pandas/io/html.py index 3193f52d239f1..8354cf413814e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -707,8 +707,8 @@ def _build_doc(self): -------- pandas.io.html._HtmlFrameParser._build_doc """ - from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + from lxml.html import HTMLParser, fromstring, parse parser = HTMLParser(recover=True, encoding=self.encoding) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b67a1c5781d91..ac5d32c60dc5e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,7 @@ from pandas.io.formats.printing import adjoin, pprint_thing if TYPE_CHECKING: - from tables import File, Node, Col # noqa:F401 + from tables import Col, File, Node # noqa:F401 # versioning attribute @@ -289,7 +289,15 @@ def read_hdf( Read from the store, close it if we opened it. Retrieve pandas object stored in file, optionally based on where - criteria + criteria. + + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. Parameters ---------- @@ -312,6 +320,10 @@ def read_hdf( mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. Default is 'r'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. where : list, optional A list of Term (or convertible) objects. start : int, optional @@ -324,10 +336,6 @@ def read_hdf( Return an iterator object. chunksize : int, optional Number of rows to include in an iteration when using an iterator. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. **kwargs Additional keyword arguments passed to HDFStore. @@ -445,6 +453,14 @@ class HDFStore: Either Fixed or Table format. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- path : str @@ -789,6 +805,14 @@ def select( """ Retrieve pandas object stored in file, optionally based on where criteria. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -852,6 +876,15 @@ def select_as_coordinates( """ return the selection as an Index + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + + Parameters ---------- key : str @@ -876,6 +909,14 @@ def select_column( return a single column from the table. This is generally only useful to select an indexable + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -912,6 +953,14 @@ def select_as_multiple( """ Retrieve pandas objects from multiple tables. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- keys : a list of the tables diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0038e39e2ffcc..17b41fd2b4379 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,8 +1,8 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython import Py_ssize_t - import numpy as np + import pandas.io.sas.sas_constants as const ctypedef signed long long int64_t diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9177696ca13d6..c87391eaa62b1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -937,7 +937,7 @@ def _get_column_names_and_types(self, dtype_mapper): return column_names_and_types def _create_table_setup(self): - from sqlalchemy import Table, Column, PrimaryKeyConstraint + from sqlalchemy import Column, PrimaryKeyConstraint, Table column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) @@ -1026,15 +1026,15 @@ def _sqlalchemy_type(self, col): col_type = lib.infer_dtype(col, skipna=True) from sqlalchemy.types import ( + TIMESTAMP, BigInteger, - Integer, - Float, - Text, Boolean, - DateTime, Date, + DateTime, + Float, + Integer, + Text, Time, - TIMESTAMP, ) if col_type == "datetime64" or col_type == "datetime": @@ -1079,7 +1079,7 @@ def _sqlalchemy_type(self, col): return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + from sqlalchemy.types import TIMESTAMP, Boolean, Date, DateTime, Float, Integer if isinstance(sqltype, Float): return float @@ -1374,7 +1374,7 @@ def to_sql( dtype = {col_name: dtype for col_name in frame} if dtype is not None: - from sqlalchemy.types import to_instance, TypeEngine + from sqlalchemy.types import TypeEngine, to_instance for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 353bc8a8936a5..b490e07e43753 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1149,8 +1149,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( - _maybe_resample, _decorate_axes, + _maybe_resample, format_dateaxis, ) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 8f3571cf13cbc..95f9fbf3995ed 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,7 +24,7 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Series, Index # noqa:F401 + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ecd20796b6f21..caa348d3a1fb9 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,9 +267,10 @@ def test_sparsearray(): def test_np(): - import numpy as np import warnings + import numpy as np + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d517eaaec68d2..0176755b54dd1 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -142,6 +142,7 @@ def test_repr(): @pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType p1 = ArrowIntervalType(pa.int64(), "left") @@ -158,6 +159,7 @@ def test_arrow_extension_type(): @pyarrow_skip def test_arrow_array(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array @@ -187,6 +189,7 @@ def test_arrow_array(): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) @@ -221,6 +224,7 @@ def test_arrow_array_missing(): ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks(breaks) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6f9a1a5be4c43..efd5d29ae0717 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -206,12 +206,16 @@ def test_constructor_raises(): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy): - a = np.array(["a", np.nan], dtype=object) - original = a.copy() - result = pd.arrays.StringArray._from_sequence(a, copy=copy) - expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + nan_arr = np.array(["a", np.nan], dtype=object) + na_arr = np.array(["a", pd.NA], dtype=object) + + result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) + expected = pd.arrays.StringArray(na_arr) + tm.assert_extension_array_equal(result, expected) - tm.assert_numpy_array_equal(a, original) + + expected = nan_arr if copy else na_arr + tm.assert_numpy_array_equal(nan_arr, expected) def test_astype_int(): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 8887dd0278afe..0d81e8e733842 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -359,6 +359,7 @@ def test_arrow_extension_type(): ) def test_arrow_array(data, freq): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType periods = period_array(data, freq=freq) @@ -384,6 +385,7 @@ def test_arrow_array(data, freq): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") @@ -399,6 +401,7 @@ def test_arrow_array_missing(): @pyarrow_skip def test_arrow_table_roundtrip(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 3a32278e2a4b1..538978358c8e7 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1522,3 +1522,22 @@ def test_apply_dtype(self, col): expected = df.dtypes tm.assert_series_equal(result, expected) + + +def test_apply_mutating(): + # GH#35462 case where applied func pins a new BlockManager to a row + df = pd.DataFrame({"a": range(100), "b": range(100, 200)}) + + def func(row): + mgr = row._mgr + row.loc["a"] += 1 + assert row._mgr is not mgr + return row + + expected = df.copy() + expected["a"] += 1 + + result = df.apply(func, axis=1) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 45f134a93a23a..0486fb2d588b6 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -214,3 +214,12 @@ def test_diff_integer_na(self, axis, expected): # Test case for default behaviour of diff result = df.diff(axis=axis) tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.randn(5, 2) + arr.flags.writeable = False + df = pd.DataFrame(arr) + result = df.diff() + expected = pd.DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index ddb5723e7bd3e..6b86a13fcf1b9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -34,6 +34,14 @@ def test_interp_basic(self): expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + assert result is not df + expected = df + tm.assert_frame_equal(result, expected) + def test_interp_bad_method(self): df = DataFrame( { diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a3f056dbf9648..8603bff0587b6 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1573,3 +1573,11 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + df = pd.DataFrame(["a", "b", "c"]) + regex = re.compile("^a$") + result = df.replace({regex: "z"}, regex=True) + expected = pd.DataFrame(["z", "b", "c"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index da4bfa9be4881..b88ef0e6691cb 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -318,3 +318,33 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): result = DataFrame(index=idx)[:0].reset_index().dtypes expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + idx = MultiIndex( + levels=[[pd.Timestamp("2020-07-20 00:00:00")], [3, 4]], + codes=[[], []], + names=["a", "b"], + ) + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): + # https://github.com/pandas-dev/pandas/issues/35657 + df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01"))) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 9ec029a6c4304..8f6902eca816f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -145,6 +145,33 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) + def test_shift_axis1_multiple_blocks(self): + # GH#35488 + df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + + result = df3.shift(2, axis=1) + + expected = df3.take([-1, -1, 0, 1, 2], axis=1) + expected.iloc[:, :2] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + # rebuild df3 because `take` call above consolidated + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + result = df3.shift(-2, axis=1) + + expected = df3.take([2, 3, 4, -1, -1], axis=1) + expected.iloc[:, -2:] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9d6b9f39a0578..52a1e3aae9058 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -287,7 +287,7 @@ def test_stat_op_api(self, float_frame, float_string_frame): assert_stat_op_api("median", float_frame, float_string_frame) try: - from scipy.stats import skew, kurtosis # noqa:F401 + from scipy.stats import kurtosis, skew # noqa:F401 assert_stat_op_api("skew", float_frame, float_string_frame) assert_stat_op_api("kurt", float_frame, float_string_frame) @@ -370,7 +370,7 @@ def kurt(x): ) try: - from scipy import skew, kurtosis # noqa:F401 + from scipy import kurtosis, skew # noqa:F401 assert_stat_op_calc("skew", skewness, float_frame_with_na) assert_stat_op_calc("kurt", kurt, float_frame_with_na) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b79fc8cd3406..cc57a3970d18b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -367,6 +367,13 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) + def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a4ed548264d39..b78bb1c492ef4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2619,6 +2619,12 @@ class DatetimeSubclass(datetime): data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) assert data.datetime.dtype == "datetime64[ns]" + def test_with_mismatched_index_length_raises(self): + # GH#33437 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + with pytest.raises(ValueError, match="Shape of passed values"): + DataFrame(dti, index=range(4)) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 40a20c8210052..ce9d4b892d775 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1061,3 +1061,16 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +def test_nonagg_agg(): + # GH 35490 - Single/Multiple agg of non-agg function give same results + # TODO: agg should raise for functions that don't aggregate + df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) + g = df.groupby("a") + + result = g.agg(["cumsum"]) + result.columns = result.columns.droplevel(-1) + expected = g.agg("cumsum") + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7e4513da37dc9..0d447a70b540d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1294,9 +1294,7 @@ def test_get_nonexistent_category(): ) -def test_series_groupby_on_2_categoricals_unobserved( - reduction_func: str, observed: bool, request -): +def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request): # GH 17605 if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") @@ -1326,7 +1324,7 @@ def test_series_groupby_on_2_categoricals_unobserved( def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( - reduction_func: str, request + reduction_func, request ): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN @@ -1374,7 +1372,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( assert np.issubdtype(result.dtype, np.integer) -def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): # GH 23865 # GH 27075 # Ensure that df.groupby, when 'by' is two pd.Categorical variables, @@ -1402,7 +1400,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun @pytest.mark.parametrize("observed", [False, None]) def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( - reduction_func: str, observed: bool, request + reduction_func, observed, request ): # GH 23865 # GH 27075 diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7f30a77872bc1..8af26eef504fc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -478,3 +478,9 @@ def test_reindex_base(self): def test_map_str(self): # See test_map.py pass + + def test_format_different_scalar_lengths(self): + # GH35439 + idx = CategoricalIndex(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c8b780455f862..3b41c4bfacf73 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,6 +5,7 @@ import pytest from pandas._libs import iNaT +from pandas.compat.numpy import _is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -417,7 +418,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index): + def test_intersection_base(self, index, request): if isinstance(index, CategoricalIndex): return @@ -434,6 +435,15 @@ def test_intersection_base(self, index): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: + # https://github.com/pandas-dev/pandas/issues/35481 + if ( + _is_numpy_dev + and isinstance(case, Series) + and isinstance(index, UInt64Index) + ): + mark = pytest.mark.xfail(reason="gh-35481") + request.node.add_marker(mark) + result = first.intersection(case) assert tm.equalContents(result, second) @@ -632,6 +642,12 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [str(x) for x in idx] + assert idx.format() == expected + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index ec4162f87010f..7bb1d98086a91 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -59,6 +59,7 @@ def test_reindex_with_same_tz(self): def test_time_loc(self): # GH8667 from datetime import time + from pandas._libs.index import _SIZE_CUTOFF ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 7345ae3032463..a5abf2946feda 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -20,6 +20,12 @@ def index(self, request): def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [f"{x:%Y-%m-%d}" for x in idx] + assert idx.format() == expected + def test_shift(self): pass # handled in test_ops diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index eaf48421dc071..59ee88117a984 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1171,8 +1171,11 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - def test_format(self, index): - self._check_method_works(Index.format, index) + def test_format_different_scalar_lengths(self): + # GH35439 + idx = Index(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected def test_format_bug(self): # GH 14626 diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a7c5734ef9b02..bfcac5d433d2c 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -21,6 +21,13 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False + def test_format(self): + # GH35439 + idx = self.create_index() + max_width = max(len(str(x)) for x in idx) + expected = [str(x).ljust(max_width) for x in idx] + assert idx.format() == expected + def test_numeric_compat(self): pass # override Base method diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index be193e0854d8d..d8e56661b7d61 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -15,7 +15,7 @@ def test_multiindex_get_loc(): # GH7724, GH2646 with warnings.catch_warnings(record=True): # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn + from numpy.random import choice, randint, randn cols = ["jim", "joe", "jolie", "joline", "jolia"] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 30b13b6ea9fce..193800fae751f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _is_numpy_dev + import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range import pandas._testing as tm @@ -945,6 +947,7 @@ def test_loc_setitem_empty_append(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e236b3da73c69..84805d06df4a8 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2141,6 +2141,15 @@ def test_dict_entries(self): assert "'a': 1" in val assert "'b': 2" in val + def test_categorical_columns(self): + # GH35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = pd.DataFrame(data, columns=cols) + df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + def test_period(self): # GH 12615 df = pd.DataFrame( diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 9c6910637fa7e..6025649e9dbec 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1682,6 +1682,15 @@ def f(a, b, styler): result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) + def test_no_cell_ids(self): + # GH 35588 + # GH 35663 + df = pd.DataFrame(data=[[0]]) + styler = Styler(df, uuid="_", cell_ids=False) + styler.render() + s = styler.render() # render twice to ensure ctx is not updated + assert s.find('') != -1 + @td.skip_if_no_mpl class TestStylerMatplotlibDep: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 12e73bae40eac..3d5f6ae3a4af9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -18,7 +18,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -1138,6 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1151,6 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers @@ -2179,3 +2181,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser = all_parsers with pytest.raises(ValueError, match="Names should be an ordered collection."): parser.read_csv(StringIO(data), names=set("QAZ")) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c397a61616c1c..d64e2d1933ace 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -37,8 +37,8 @@ def test_read_csv(cleared_fs): def test_reasonable_error(monkeypatch, cleared_fs): - from fsspec.registry import known_implementations from fsspec import registry + from fsspec.registry import known_implementations registry.target.clear() with pytest.raises(ValueError) as e: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4d93119ffa3f5..eacf4fa08545d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -11,8 +11,7 @@ @td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state @@ -37,8 +36,7 @@ def open(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( @@ -76,8 +74,7 @@ def mock_get_filepath_or_buffer(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0991fae39138e..29b787d39c09d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -48,10 +48,10 @@ try: import sqlalchemy - import sqlalchemy.schema - import sqlalchemy.sql.sqltypes as sqltypes from sqlalchemy.ext import declarative from sqlalchemy.orm import session as sa_session + import sqlalchemy.schema + import sqlalchemy.sql.sqltypes as sqltypes SQLALCHEMY_INSTALLED = True except ImportError: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 896d3278cdde1..3b1ff233c5ec1 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -13,7 +13,6 @@ from pandas import DataFrame, Series import pandas._testing as tm - """ This is a common base class used for various plotting tests """ @@ -24,6 +23,7 @@ class TestPlotBase: def setup_method(self, method): import matplotlib as mpl + from pandas.plotting._matplotlib import compat mpl.rcdefaults() @@ -187,8 +187,8 @@ def _check_colors( Series used for color grouping key used for andrew_curves, parallel_coordinates, radviz test """ + from matplotlib.collections import Collection, LineCollection, PolyCollection from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection, LineCollection conv = self.colorconverter if linecolors is not None: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 317a994bd9a32..ee43e5d7072fe 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2408,8 +2408,8 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" def test_default_color_cycle(self): - import matplotlib.pyplot as plt import cycler + import matplotlib.pyplot as plt colors = list("rgbk") plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) @@ -2953,8 +2953,8 @@ def _check(axes): @td.skip_if_no_scipy def test_memory_leak(self): """ Check that every plot type gets properly collected. """ - import weakref import gc + import weakref results = {} for kind in plotting.PlotAccessor._all_kinds: @@ -3032,8 +3032,8 @@ def test_df_subplots_patterns_minorticks(self): @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) @@ -3422,9 +3422,9 @@ def test_xlabel_ylabel_dataframe_subplots( def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.gridspec # noqa + import matplotlib.pyplot as plt gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index b6a6c326c3df3..34c881855d16a 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -101,7 +101,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -352,6 +352,7 @@ class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle + from pandas.plotting._matplotlib.hist import _grouped_hist df = DataFrame(randn(500, 2), columns=["A", "B"]) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 75eeede472fe9..f5c1c58f3f7ed 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -131,9 +131,10 @@ def test_scatter_matrix_axis(self): @pytest.mark.slow def test_andrews_curves(self, iris): - from pandas.plotting import andrews_curves from matplotlib import cm + from pandas.plotting import andrews_curves + df = iris _check_plot_works(andrews_curves, frame=df, class_column="Name") @@ -206,9 +207,10 @@ def test_andrews_curves(self, iris): @pytest.mark.slow def test_parallel_coordinates(self, iris): - from pandas.plotting import parallel_coordinates from matplotlib import cm + from pandas.plotting import parallel_coordinates + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") @@ -279,9 +281,10 @@ def test_parallel_coordinates_with_sorted_labels(self): @pytest.mark.slow def test_radviz(self, iris): - from pandas.plotting import radviz from matplotlib import cm + from pandas.plotting import radviz + df = iris _check_plot_works(radviz, frame=df, class_column="Name") @@ -397,6 +400,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. from matplotlib import cm + from pandas.plotting._matplotlib.style import _get_standard_colors color_before = cm.gnuplot(range(5)) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 151bb3bed7207..cc00626e992f3 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -452,7 +452,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -827,6 +827,7 @@ def test_standard_colors(self): @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors + from pandas.plotting._matplotlib.style import _get_standard_colors # multiple colors like mediumaquamarine diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 9b09f0033715d..895de2b748c34 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1339,3 +1339,25 @@ def test_merge_index_column_tz(self): index=pd.Index([0, 1, 2, 3, 4]), ) tm.assert_frame_equal(result, expected) + + def test_left_index_right_index_tolerance(self): + # https://github.com/pandas-dev/pandas/issues/35558 + dr1 = pd.date_range( + start="1/1/2020", end="1/20/2020", freq="2D" + ) + pd.Timedelta(seconds=0.4) + dr2 = pd.date_range(start="1/1/2020", end="2/1/2020") + + df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1)) + df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2)) + + expected = pd.DataFrame( + {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1) + ) + result = pd.merge_asof( + df1, + df2, + left_index=True, + right_index=True, + tolerance=pd.Timedelta(seconds=0.5), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 0b34fab7b80b1..088f8681feb99 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,7 +11,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm - """ Also test support for datetime64[ns] in Series / DataFrame """ @@ -166,6 +165,7 @@ def test_getitem_setitem_datetime_tz_pytz(): def test_getitem_setitem_datetime_tz_dateutil(): from dateutil.tz import tzutc + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz tz = ( diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 19caf4eccf748..4b4ef5ea046be 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -90,7 +90,7 @@ def test_with_nan(self): tm.assert_series_equal(result, expected) def test_periodindex(self): - from pandas import period_range, PeriodIndex + from pandas import PeriodIndex, period_range # array or list or dates N = 50 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 11802c59a29da..f78a28c66e946 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -415,3 +417,11 @@ def test_replace_extension_other(self): # https://github.com/pandas-dev/pandas/issues/34530 ser = pd.Series(pd.array([1, 2, 3], dtype="Int64")) ser.replace("", "") # no exception + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + s = pd.Series(["a", "b", "c"]) + regex = re.compile("^a$") + result = s.replace({regex: "z"}, regex=True) + expected = pd.Series(["z", "b", "c"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 7c82edbaec177..45592f8d99b93 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -141,3 +141,14 @@ def test_truncate_multiindex(self): expected = df.col tm.assert_series_equal(result, expected) + + def test_truncate_one_element_series(self): + # GH 35544 + series = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) + before = pd.Timestamp("2020-08-02") + after = pd.Timestamp("2020-08-04") + + result = series.truncate(before=before, after=after) + + # the input Series and the expected Series are the same + tm.assert_series_equal(result, series) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5c8a0d224c4f9..ef2bafd4ea2ad 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -195,8 +195,8 @@ def test_add_with_duplicate_index(self): tm.assert_series_equal(result, expected) def test_add_na_handling(self): - from decimal import Decimal from datetime import date + from decimal import Decimal s = Series( [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index e718a6b759963..b32c5e91af295 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -90,7 +90,7 @@ def test_statsmodels(): def test_scikit_learn(df): sklearn = import_module("sklearn") # noqa - from sklearn import svm, datasets + from sklearn import datasets, svm digits = datasets.load_digits() clf = svm.SVC(gamma=0.001, C=100.0) diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 1e193f22a6698..f68d83f7f4d58 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -166,3 +166,16 @@ def test_to_timedelta_ignore_strings_unit(self): arr = np.array([1, 2, "error"], dtype=object) result = pd.to_timedelta(arr, unit="ns", errors="ignore") tm.assert_numpy_array_equal(result, arr) + + def test_to_timedelta_nullable_int64_dtype(self): + # GH 35574 + expected = Series([timedelta(days=1), timedelta(days=2)]) + result = to_timedelta(Series([1, 2], dtype="Int64"), unit="days") + + tm.assert_series_equal(result, expected) + + # IntegerArray Series with nulls + expected = Series([timedelta(days=1), None]) + result = to_timedelta(Series([1, None], dtype="Int64"), unit="days") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index d9fdf1491c328..f9259beab5d13 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas import array import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side): with pytest.raises(AssertionError, match=msg): tm.assert_extension_array_equal(*args) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = array([1, 2, 3], dtype="Int64") + right = array([1, 2, 3], dtype=right_dtype) + tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index fe3e1ff906919..3aa3c64923b14 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -260,3 +260,11 @@ def test_assert_frame_equal_interval_dtype_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1284cc9d4f49b..f3c66052b1904 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -281,3 +281,26 @@ class MySeries(Series): with pytest.raises(AssertionError, match="Series classes are different"): tm.assert_series_equal(s3, s1, check_series_type=True) + + +def test_series_equal_exact_for_nonnumeric(): + # https://github.com/pandas-dev/pandas/issues/35446 + s1 = Series(["a", "b"]) + s2 = Series(["a", "b"]) + s3 = Series(["b", "a"]) + + tm.assert_series_equal(s1, s2, check_exact=True) + tm.assert_series_equal(s2, s1, check_exact=True) + + with pytest.raises(AssertionError): + tm.assert_series_equal(s1, s3, check_exact=True) + with pytest.raises(AssertionError): + tm.assert_series_equal(s3, s1, check_exact=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.Series([1, 2, 3], dtype="Int64") + right = pd.Series([1, 2, 3], dtype=right_dtype) + tm.assert_series_equal(left, right, check_dtype=False) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 4a0212e890d3a..2300d8dd5529b 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -140,7 +140,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): ) def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=3) @@ -177,7 +177,7 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) @pytest.mark.parametrize("constructor", [Series, DataFrame]) def test_rolling_forward_skewness(constructor): - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=5) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 12c314d5e9ec9..69cd1d1ba069c 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -108,7 +108,7 @@ def test_ewma_halflife_without_times(halflife_with_times): @pytest.mark.parametrize("min_periods", [0, 2]) def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): halflife = halflife_with_times - data = np.arange(10) + data = np.arange(10.0) data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 744ca264e91d9..d0a62374d0888 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -214,3 +214,172 @@ def foo(x): name="value", ) tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") + def test_groupby_rolling_center_center(self): + # GH 35552 + series = Series(range(1, 6)) + result = series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 5, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))), + ) + tm.assert_series_equal(result, expected) + + series = Series(range(1, 5)) + result = series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 4, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))), + ) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ("b", 10), + ), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") + def test_groupby_subselect_rolling(self): + # GH 35486 + df = DataFrame( + {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]} + ) + result = df.groupby("a")[["b"]].rolling(2).max() + expected = DataFrame( + [np.nan, np.nan, 2.0, np.nan], + columns=["b"], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].rolling(2).max() + expected = Series( + [np.nan, np.nan, 2.0, np.nan], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + name="b", + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") + def test_groupby_rolling_custom_indexer(self): + # GH 35557 + class SimpleIndexer(pd.api.indexers.BaseIndexer): + def get_window_bounds( + self, num_values=0, min_periods=None, center=None, closed=None + ): + min_periods = self.window_size if min_periods is None else 0 + end = np.arange(num_values, dtype=np.int64) + 1 + start = end.copy() - self.window_size + start[start < 0] = min_periods + return start, end + + df = pd.DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5 + ) + result = ( + df.groupby(df.index) + .rolling(SimpleIndexer(window_size=3), min_periods=1) + .sum() + ) + expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") + def test_groupby_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + result = ( + df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") + def test_groupby_subset_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + + result = ( + df.groupby("group")[["column1", "date"]] + .rolling("1D", on="date", closed="left")["column1"] + .sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index f413490764124..3a8a1a3144269 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -53,8 +53,8 @@ def plot(self, left, right, labels=None, vertical: bool = True): vertical : bool, default True If True, use vertical layout. If False, use horizontal layout. """ - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt if not isinstance(left, list): left = [left] diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index a4a1d83177c50..bdf633839b2cd 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -120,7 +120,9 @@ def _skip_if_no_scipy() -> bool: ) -def skip_if_installed(package: str) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_installed(package: str): """ Skip a test if a package is installed. @@ -134,7 +136,9 @@ def skip_if_installed(package: str) -> Callable: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_no(package: str, min_version: Optional[str] = None): """ Generic function to help skip tests when required packages are not present on the testing system. @@ -196,14 +200,12 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) -def skip_if_np_lt( - ver_str: str, reason: Optional[str] = None, *args, **kwds -) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_np_lt(ver_str: str, *args, reason: Optional[str] = None): if reason is None: reason = f"NumPy {ver_str} or greater required" - return pytest.mark.skipif( - _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds - ) + return pytest.mark.skipif(_np_version < LooseVersion(ver_str), *args, reason=reason) def parametrize_fixture_doc(*args): diff --git a/requirements-dev.txt b/requirements-dev.txt index 0c024d1b54637..c0dd77cd73ddc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,7 @@ cpplint flake8<3.8.0 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 -isort==4.3.21 +isort>=5.2.1 mypy==0.730 pycodestyle gitpython @@ -32,7 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=5.0.1,<6.0.0rc0 +pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 pytest-asyncio diff --git a/setup.cfg b/setup.cfg index 00af7f6f1b79a..ee5725e36d193 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,7 +105,7 @@ known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf +known_third_party = announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/setup.py b/setup.py index aebbdbf4d1e96..22da02360619e 100755 --- a/setup.py +++ b/setup.py @@ -457,6 +457,9 @@ def run(self): if sys.version_info[:2] == (3, 8): # GH 33239 extra_compile_args.append("-Wno-error=deprecated-declarations") + # https://github.com/pandas-dev/pandas/issues/35559 + extra_compile_args.append("-Wno-error=unreachable-code") + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled