diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000000000..6133037bf3b7d
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,22 @@
+version: 2.1
+
+jobs:
+  test-arm:
+    machine:
+      image: ubuntu-2004:202101-01
+    resource_class: arm.large
+    environment:
+      ENV_FILE: ci/deps/circle-38-arm64.yaml
+      PYTEST_WORKERS: auto
+      PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db"
+      PYTEST_TARGET: "pandas"
+      PANDAS_CI: "1"
+    steps:
+      - checkout
+      - run: .circleci/setup_env.sh
+      - run: PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH ci/run_tests.sh
+
+workflows:
+  test:
+    jobs:
+      - test-arm
diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh
new file mode 100755
index 0000000000000..c03a7ff4be8b3
--- /dev/null
+++ b/.circleci/setup_env.sh
@@ -0,0 +1,106 @@
+#!/bin/bash -e
+
+# edit the locale file if needed
+if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then
+    echo "Adding locale to the first line of pandas/__init__.py"
+    rm -f pandas/__init__.pyc
+    SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n"
+    sed -i "$SEDC" pandas/__init__.py
+
+    echo "[head -4 pandas/__init__.py]"
+    head -4 pandas/__init__.py
+    echo
+fi
+
+
+MINICONDA_DIR=/usr/local/miniconda
+if [ -e $MINICONDA_DIR ] && [ "$BITS32" != yes ]; then
+    echo "Found Miniconda installation at $MINICONDA_DIR"
+else
+    echo "Install Miniconda"
+    DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest"
+    if [[ "$(uname -m)" == 'aarch64' ]]; then
+        CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh"
+    elif [[ "$(uname)" == 'Linux' ]]; then
+        if [[ "$BITS32" == "yes" ]]; then
+            CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh"
+        else
+            CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh"
+        fi
+    elif [[ "$(uname)" == 'Darwin' ]]; then
+        CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh"
+    else
+        echo "OS $(uname) not supported"
+        exit 1
+    fi
+    echo "Downloading $CONDA_URL"
+    wget -q $CONDA_URL -O miniconda.sh
+    chmod +x miniconda.sh
+
+    MINICONDA_DIR="$HOME/miniconda3"
+    rm -rf $MINICONDA_DIR
+    ./miniconda.sh -b -p $MINICONDA_DIR
+fi
+export PATH=$MINICONDA_DIR/bin:$PATH
+
+echo
+echo "which conda"
+which conda
+
+echo
+echo "update conda"
+conda config --set ssl_verify false
+conda config --set quiet true --set always_yes true --set changeps1 false
+conda install -y -c conda-forge -n base 'mamba>=0.21.2' pip setuptools
+
+echo "conda info -a"
+conda info -a
+
+echo "conda list (root environment)"
+conda list
+
+echo
+# Clean up any left-over from a previous build
+mamba env remove -n pandas-dev
+echo "mamba env update --file=${ENV_FILE}"
+# See https://github.com/mamba-org/mamba/issues/633
+mamba create -q -n pandas-dev
+time mamba env update -n pandas-dev --file="${ENV_FILE}"
+
+echo "conda list -n pandas-dev"
+conda list -n pandas-dev
+
+if [[ "$BITS32" == "yes" ]]; then
+    # activate 32-bit compiler
+    export CONDA_BUILD=1
+fi
+
+echo "activate pandas-dev"
+source activate pandas-dev
+
+# Explicitly set an environment variable indicating that this is pandas' CI environment.
+#
+# This allows us to enable things like -Werror that shouldn't be activated in
+# downstream CI jobs that may also build pandas from source.
+export PANDAS_CI=1
+
+if pip list | grep -q ^pandas; then
+    echo
+    echo "remove any installed pandas package w/o removing anything else"
+    pip uninstall -y pandas || true
+fi
+
+if [ "$(conda list -f qt --json)" != [] ]; then
+    echo
+    echo "remove qt"
+    echo "causes problems with the clipboard, we use xsel for that"
+    conda remove qt -y --force || true
+fi
+
+echo "Build extensions"
+python setup.py build_ext -q -j3
+
+echo "Install pandas"
+python -m pip install --no-build-isolation --no-use-pep517 -e .
+
+echo "done"
diff --git a/.devcontainer.json b/.devcontainer.json
index 8bea96aea29c1..7c5d009260c64 100644
--- a/.devcontainer.json
+++ b/.devcontainer.json
@@ -9,8 +9,7 @@
     // You can edit these settings after create using File > Preferences > Settings > Remote.
     "settings": {
         "terminal.integrated.shell.linux": "/bin/bash",
-        "python.condaPath": "/opt/conda/bin/conda",
-        "python.pythonPath": "/opt/conda/bin/python",
+        "python.pythonPath": "/usr/local/bin/python",
         "python.formatting.provider": "black",
         "python.linting.enabled": true,
         "python.linting.flake8Enabled": true,
diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
deleted file mode 100644
index 87a5b7905fc6d..0000000000000
--- a/.github/CODE_OF_CONDUCT.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# Contributor Code of Conduct
-
-As contributors and maintainers of this project, and in the interest of
-fostering an open and welcoming community, we pledge to respect all people who
-contribute through reporting issues, posting feature requests, updating
-documentation, submitting pull requests or patches, and other activities.
-
-We are committed to making participation in this project a harassment-free
-experience for everyone, regardless of level of experience, gender, gender
-identity and expression, sexual orientation, disability, personal appearance,
-body size, race, ethnicity, age, religion, or nationality.
-
-Examples of unacceptable behavior by participants include:
-
-* The use of sexualized language or imagery
-* Personal attacks
-* Trolling or insulting/derogatory comments
-* Public or private harassment
-* Publishing other's private information, such as physical or electronic
-  addresses, without explicit permission
-* Other unethical or unprofessional conduct
-
-Project maintainers have the right and responsibility to remove, edit, or
-reject comments, commits, code, wiki edits, issues, and other contributions
-that are not aligned to this Code of Conduct, or to ban temporarily or
-permanently any contributor for other behaviors that they deem inappropriate,
-threatening, offensive, or harmful.
-
-By adopting this Code of Conduct, project maintainers commit themselves to
-fairly and consistently applying these principles to every aspect of managing
-this project. Project maintainers who do not follow or enforce the Code of
-Conduct may be permanently removed from the project team.
-
-This Code of Conduct applies both within project spaces and in public spaces
-when an individual is representing the project or its community.
-
-A working group of community members is committed to promptly addressing any
-reported issues. The working group is made up of pandas contributors and users.
-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported by contacting the working group by e-mail (pandas-coc@googlegroups.com).
-Messages sent to this e-mail address will not be publicly visible but only to
-the working group members. The working group currently includes
-
-- Safia Abdalla
-- Tom Augspurger
-- Joris Van den Bossche
-- Camille Scott
-- Nathaniel Smith
-
-All complaints will be reviewed and investigated and will result in a response
-that is deemed necessary and appropriate to the circumstances. Maintainers are
-obligated to maintain confidentiality with regard to the reporter of an
-incident.
-
-This Code of Conduct is adapted from the [Contributor Covenant][homepage],
-version 1.3.0, available at
-[https://www.contributor-covenant.org/version/1/3/0/][version],
-and the [Swift Code of Conduct][swift].
-
-[homepage]: https://www.contributor-covenant.org
-[version]: https://www.contributor-covenant.org/version/1/3/0/
-[swift]: https://swift.org/community/#code-of-conduct
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
deleted file mode 100644
index 49200523df40f..0000000000000
--- a/.github/CONTRIBUTING.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Contributing to pandas
-
-Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!
-
-Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas.pydata.org/docs/dev/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information.
-
-## Getting Started
-
-If you are looking to contribute to the *pandas* codebase, the best place to start is the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues). This is also a great place for filing bug reports and making suggestions for ways in which we can improve the code and documentation.
-
-If you have additional questions, feel free to ask them on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). Further information can also be found in the "[Where to start?](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#where-to-start)" section.
-
-## Filing Issues
-
-If you notice a bug in the code or documentation, or have suggestions for how we can improve either, feel free to create an issue on the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) using [GitHub's "issue" form](https://github.com/pandas-dev/pandas/issues/new). The form contains some questions that will help us best address your issue. For more information regarding how to file issues against *pandas*, please refer to the "[Bug reports and enhancement requests](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#bug-reports-and-enhancement-requests)" section.
-
-## Contributing to the Codebase
-
-The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas), so you will need to use [Git](https://git-scm.com/) to clone the project and make changes to the codebase. Once you have obtained a copy of the code, you should create a development environment that is separate from your existing Python environment so that you can make and test changes without compromising your own work environment. For more information, please refer to the "[Working with the code](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#working-with-the-code)" section.
-
-Before submitting your changes for review, make sure to check that your changes do not break any tests. You can find more information about our test suites in the "[Test-driven development/code writing](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#test-driven-development-code-writing)" section. We also have guidelines regarding coding style that will be enforced during testing, which can be found in the "[Code standards](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#code-standards)" section.
-
-Once your changes are ready to be submitted, make sure to push your changes to GitHub before creating a pull request. Details about how to do that can be found in the "[Contributing your changes to pandas](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#contributing-your-changes-to-pandas)" section. We will review your changes, and you will most likely be asked to make additional changes before it is finally ready to merge. However, once it's ready, we will merge it, and you will have successfully contributed to the codebase!
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
deleted file mode 100644
index 27dfded808b95..0000000000000
--- a/.github/FUNDING.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-custom: https://pandas.pydata.org/donate.html
-github: [numfocus]
-tidelift: pypi/pandas
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
deleted file mode 100644
index 765c1b8bff62e..0000000000000
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ /dev/null
@@ -1,39 +0,0 @@
----
-
-name: Bug Report
-about: Create a bug report to help us improve pandas
-title: "BUG:"
-labels: "Bug, Needs Triage"
-
----
-
-- [ ] I have checked that this issue has not already been reported.
-
-- [ ] I have confirmed this bug exists on the latest version of pandas.
-
-- [ ] (optional) I have confirmed this bug exists on the master branch of pandas.
-
----
-
-**Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug.
-
-#### Code Sample, a copy-pastable example
-
-```python
-# Your code here
-
-```
-
-#### Problem description
-
-[this should explain **why** the current behaviour is a problem and why the expected output is a better solution]
-
-#### Expected Output
-
-#### Output of ``pd.show_versions()``
-
-<details>
-
-[paste the output of ``pd.show_versions()`` here leaving a blank line after the details tag]
-
-</details>
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000000000..36bc8dcf02bae --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,68 @@ +name: Bug Report +description: Report incorrect behavior in the pandas library +title: "BUG: " +labels: [Bug, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Pandas version checks + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this bug exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this bug exists on the main branch of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to + provide a minimal, copy-pastable example. + placeholder: > + import pandas as pd + + df = pd.DataFrame(range(5)) + + ... + render: python + validations: + required: true + - type: textarea + id: problem + attributes: + label: Issue Description + description: > + Please provide a description of the issue shown in the reproducible example. + validations: + required: true + - type: textarea + id: expected-behavior + attributes: + label: Expected Behavior + description: > + Please describe or show a code example of the expected behavior. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.md b/.github/ISSUE_TEMPLATE/documentation_improvement.md deleted file mode 100644 index 3351ff9581121..0000000000000 --- a/.github/ISSUE_TEMPLATE/documentation_improvement.md +++ /dev/null @@ -1,22 +0,0 @@ ---- - -name: Documentation Improvement -about: Report wrong or missing documentation -title: "DOC:" -labels: "Docs, Needs Triage" - ---- - -#### Location of the documentation - -[this should provide the location of the documentation, e.g. "pandas.read_csv" or the URL of the documentation, e.g. "https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html"] - -**Note**: You can check the latest versions of the docs on `master` [here](https://pandas.pydata.org/docs/dev/). - -#### Documentation problem - -[this should provide a description of what documentation you believe needs to be fixed/improved] - -#### Suggested fix for documentation - -[this should explain the suggested fix and **why** it's better than the existing documentation] diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.yaml b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml new file mode 100644 index 0000000000000..b89600f8598e7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml @@ -0,0 +1,41 @@ +name: Documentation Improvement +description: Report wrong or missing documentation +title: "DOC: " +labels: [Docs, Needs Triage] + +body: + - type: checkboxes + attributes: + label: Pandas version checks + options: + - label: > + I have checked that the issue still exists on the latest versions of the docs + on `main` [here](https://pandas.pydata.org/docs/dev/) + required: true + - type: textarea + id: location + attributes: + label: Location of the documentation + description: > + Please provide the location of the documentation, e.g. "pandas.read_csv" or the + URL of the documentation, e.g. + "https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html" + placeholder: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + validations: + required: true + - type: textarea + id: problem + attributes: + label: Documentation problem + description: > + Please provide a description of what documentation you believe needs to be fixed/improved + validations: + required: true + - type: textarea + id: suggested-fix + attributes: + label: Suggested fix for documentation + description: > + Please explain the suggested fix and **why** it's better than the existing documentation + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 0c30b941bc520..0000000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,33 +0,0 @@ ---- - -name: Feature Request -about: Suggest an idea for pandas -title: "ENH:" -labels: "Enhancement, Needs Triage" - ---- - -#### Is your feature request related to a problem? - -[this should provide a description of what the problem is, e.g. "I wish I could use pandas to do [...]"] - -#### Describe the solution you'd like - -[this should provide a description of the feature request, e.g. 
"`DataFrame.foo` should get a new parameter `bar` that [...]", try to write a docstring for the desired feature] - -#### API breaking implications - -[this should provide a description of how this feature will affect the API] - -#### Describe alternatives you've considered - -[this should provide a description of any alternative solutions or features you've considered] - -#### Additional context - -[add any other context, code examples, or references to existing implementations about the feature request here] - -```python -# Your code here, if applicable - -``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 0000000000000..f837eb1ca5bb7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,72 @@ +name: Feature Request +description: Suggest an idea for pandas +title: "ENH: " +labels: [Enhancement, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Feature Type + description: Please check what type of feature request you would like to propose. + options: + - label: > + Adding new functionality to pandas + - label: > + Changing existing functionality in pandas + - label: > + Removing existing functionality in pandas + - type: textarea + id: description + attributes: + label: Problem Description + description: > + Please describe what problem the feature would solve, e.g. "I wish I could use pandas to ..." + placeholder: > + I wish I could use pandas to return a Series from a DataFrame when possible. + validations: + required: true + - type: textarea + id: feature + attributes: + label: Feature Description + description: > + Please describe how the new feature would be implemented, using psudocode if relevant. + placeholder: > + Add a new parameter to DataFrame, to_series, to return a Series if possible. + + def __init__(self, ..., to_series: bool=False): + """ + Parameters + ---------- + ... + + to_series : bool, default False + Return a Series if possible + """ + if to_series: + return Series(data) + validations: + required: true + - type: textarea + id: alternative + attributes: + label: Alternative Solutions + description: > + Please describe any alternative solution (existing functionality, 3rd party package, etc.) + that would satisfy the feature request. + placeholder: > + Write a custom function to return a Series when possible. + + def to_series(...) + result = pd.DataFrame(...) + ... + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional Context + description: > + Please provide any relevant Github issues, code examples or references that help describe and support + the feature request. diff --git a/.github/ISSUE_TEMPLATE/installation_issue.yaml b/.github/ISSUE_TEMPLATE/installation_issue.yaml new file mode 100644 index 0000000000000..a80269ff0f12d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/installation_issue.yaml @@ -0,0 +1,66 @@ +name: Installation Issue +description: Report issues installing the pandas library on your system +title: "BUILD: " +labels: [Build, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Installation check + options: + - label: > + I have read the [installation guide](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-pandas). 
+ required: true + - type: input + id: platform + attributes: + label: Platform + description: > + Please provide the output of ``import platform; print(platform.platform())`` + validations: + required: true + - type: dropdown + id: method + attributes: + label: Installation Method + description: > + Please provide how you tried to install pandas from a clean environment. + options: + - pip install + - conda install + - apt-get install + - Built from source + - Other + validations: + required: true + - type: input + id: pandas + attributes: + label: pandas Version + description: > + Please provide the version of pandas you are trying to install. + validations: + required: true + - type: input + id: python + attributes: + label: Python Version + description: > + Please provide the installed version of Python. + validations: + required: true + - type: textarea + id: logs + attributes: + label: Installation Logs + description: > + If possible, please copy and paste the installation logs when attempting to install pandas. + value: > +
+ + + Replace this line with the installation logs. + + +
diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yaml b/.github/ISSUE_TEMPLATE/performance_issue.yaml new file mode 100644 index 0000000000000..096e012f4ee0f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance_issue.yaml @@ -0,0 +1,53 @@ +name: Performance Issue +description: Report slow performance or memory issues when running pandas code +title: "PERF: " +labels: [Performance, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Pandas version checks + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this issue exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this issue exists on the main branch of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please provide a minimal, copy-pastable example that quantifies + [slow runtime](https://docs.python.org/3/library/timeit.html) or + [memory](https://pypi.org/project/memory-profiler/) issues. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true + - type: textarea + id: prior-performance + attributes: + label: Prior Performance + description: > + If applicable, please provide the prior version of pandas and output + of the same reproducible example where the performance issue did not exist. diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md deleted file mode 100644 index 9b48918ff2f6d..0000000000000 --- a/.github/ISSUE_TEMPLATE/submit_question.md +++ /dev/null @@ -1,24 +0,0 @@ ---- - -name: Submit Question -about: Ask a general question about pandas -title: "QST:" -labels: "Usage Question, Needs Triage" - ---- - -- [ ] I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) on StackOverflow for similar questions. - -- [ ] I have asked my usage related question on [StackOverflow](https://stackoverflow.com). - ---- - -#### Question about pandas - -**Note**: If you'd still like to submit a question, please read [this guide]( -https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. - -```python -# Your code here, if applicable - -``` diff --git a/.github/ISSUE_TEMPLATE/submit_question.yml b/.github/ISSUE_TEMPLATE/submit_question.yml new file mode 100644 index 0000000000000..6f73041b0f527 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/submit_question.yml @@ -0,0 +1,44 @@ +name: Submit Question +description: Ask a general question about pandas +title: "QST: " +labels: [Usage Question, Needs Triage] + +body: + - type: markdown + attributes: + value: > + Since [StackOverflow](https://stackoverflow.com) is better suited towards answering + usage questions, we ask that all usage questions are first asked on StackOverflow. + - type: checkboxes + attributes: + label: Research + options: + - label: > + I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) + on StackOverflow for similar questions. + required: true + - label: > + I have asked my usage related question on [StackOverflow](https://stackoverflow.com). + required: true + - type: input + id: question-link + attributes: + label: Link to question on StackOverflow + validations: + required: true + - type: markdown + attributes: + value: --- + - type: textarea + id: question + attributes: + label: Question about pandas + description: > + **Note**: If you'd still like to submit a question, please read [this guide]( + https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing + how to provide the necessary information for us to reproduce your question. + placeholder: | + ```python + # Your code here, if applicable + + ``` diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7fb5a6ddf2024..876e5e2cfbb1e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ -- [ ] closes #xxxx -- [ ] tests added / passed -- [ ] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them -- [ ] whatsnew entry +- [ ] closes #xxxx (Replace xxxx with the Github issue number) +- [ ] [Tests added and passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#writing-tests) if fixing a bug or adding a new feature +- [ ] All [code checks passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#pre-commit). 
+- [ ] Added [type annotations](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#type-hints) to new arguments/methods/functions. +- [ ] Added an entry in the latest `doc/source/whatsnew/vX.X.X.rst` file if fixing a bug or adding a new feature. diff --git a/.github/SECURITY.md b/.github/SECURITY.md deleted file mode 100644 index f3b059a5d4f13..0000000000000 --- a/.github/SECURITY.md +++ /dev/null @@ -1 +0,0 @@ -To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there. diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index d4777bcd1d079..23bb988ef4d73 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -6,12 +6,17 @@ runs: - name: Environment Detail run: | - conda info - conda list - shell: bash -l {0} + micromamba info + micromamba list + shell: bash -el {0} - name: Build Pandas run: | - python setup.py build_ext -j 2 - python -m pip install -e . --no-build-isolation --no-use-pep517 - shell: bash -l {0} + python setup.py build_ext -j $N_JOBS + python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + shell: bash -el {0} + env: + # Cannot use parallel compilation on Windows, see https://github.com/pandas-dev/pandas/issues/30873 + # GH 47305: Parallel build causes flaky ImportError: /home/runner/work/pandas/pandas/pandas/_libs/tslibs/timestamps.cpython-38-x86_64-linux-gnu.so: undefined symbol: pandas_datetime_to_datetimestruct + N_JOBS: 1 + #N_JOBS: ${{ runner.os == 'Windows' && 1 || 2 }} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml new file mode 100644 index 0000000000000..2a7601f196ec4 --- /dev/null +++ b/.github/actions/run-tests/action.yml @@ -0,0 +1,27 @@ +name: Run tests and report results +runs: + using: composite + steps: + - name: Test + run: ci/run_tests.sh + shell: bash -el {0} + + - name: Publish test results + uses: actions/upload-artifact@v2 + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Report Coverage + run: coverage report -m + shell: bash -el {0} + if: failure() + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + flags: unittests + name: codecov-pandas + fail_ci_if_error: false + if: failure() diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml new file mode 100644 index 0000000000000..7d1e54052f938 --- /dev/null +++ b/.github/actions/setup-conda/action.yml @@ -0,0 +1,36 @@ +name: Set up Conda environment +inputs: + environment-file: + description: Conda environment file to use. + default: environment.yml + environment-name: + description: Name to use for the Conda environment + default: test + extra-specs: + description: Extra packages to install + required: false + pyarrow-version: + description: If set, overrides the PyArrow version in the Conda environment to the given string. 
+ required: false +runs: + using: composite + steps: + - name: Set Arrow version in ${{ inputs.environment-file }} to ${{ inputs.pyarrow-version }} + run: | + grep -q ' - pyarrow' ${{ inputs.environment-file }} + sed -i"" -e "s/ - pyarrow<10/ - pyarrow=${{ inputs.pyarrow-version }}/" ${{ inputs.environment-file }} + cat ${{ inputs.environment-file }} + shell: bash + if: ${{ inputs.pyarrow-version }} + + - name: Install ${{ inputs.environment-file }} + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ${{ inputs.environment-file }} + environment-name: ${{ inputs.environment-name }} + extra-specs: ${{ inputs.extra-specs }} + channels: conda-forge + channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }} + condarc-file: ci/condarc.yml + cache-env: true + cache-downloads: true diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml deleted file mode 100644 index 9ef00e7a85a6f..0000000000000 --- a/.github/actions/setup/action.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: Set up pandas -description: Runs all the setup steps required to have a built pandas ready to use -runs: - using: composite - steps: - - name: Setting conda path - run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH - shell: bash -l {0} - - - name: Setup environment and build pandas - run: ci/setup_env.sh - shell: bash -l {0} diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml new file mode 100644 index 0000000000000..49df3a077cfe7 --- /dev/null +++ b/.github/workflows/32-bit-linux.yml @@ -0,0 +1,53 @@ +name: 32 Bit Linux + +on: + push: + branches: + - main + - 1.5.x + pull_request: + branches: + - main + - 1.5.x + paths-ignore: + - "doc/**" + +permissions: + contents: read + +jobs: + pytest: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Run 32-bit manylinux2014 Docker Build / Tests + run: | + # Without this (line 34), versioneer will not be able to determine the pandas version. + # This is because of a security update to git that blocks it from reading the config folder if + # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the + # Docker container. + # xref https://github.com/pypa/manylinux/issues/1309 + docker pull quay.io/pypa/manylinux2014_i686 + docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ + /bin/bash -xc "cd pandas && \ + git config --global --add safe.directory /pandas && \ + /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ + . ~/virtualenvs/pandas-dev/bin/activate && \ + python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ + pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ + python setup.py build_ext -q -j1 && \ + python -m pip install --no-build-isolation --no-use-pep517 -e . 
&& \ + python -m pip list && \ + export PANDAS_CI=1 && \ + pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" + + - name: Publish test results for Python 3.8-32 bit full Linux + uses: actions/upload-artifact@v3 + with: + name: Test results + path: test-data.xml + if: failure() diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml index a6d3f1f383751..b3331060823a9 100644 --- a/.github/workflows/assign.yml +++ b/.github/workflows/assign.yml @@ -3,12 +3,17 @@ on: issue_comment: types: created +permissions: + contents: read + jobs: - one: - runs-on: ubuntu-latest + issue_assign: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-22.04 steps: - if: github.event.comment.body == 'take' - name: run: | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees diff --git a/.github/workflows/asv-bot.yml b/.github/workflows/asv-bot.yml new file mode 100644 index 0000000000000..d264698e60485 --- /dev/null +++ b/.github/workflows/asv-bot.yml @@ -0,0 +1,78 @@ +name: "ASV Bot" + +on: + issue_comment: # Pull requests are issues + types: + - created + +env: + ENV_FILE: environment.yml + COMMENT: ${{github.event.comment.body}} + +permissions: + contents: read + +jobs: + autotune: + permissions: + contents: read + issues: write + pull-requests: write + name: "Run benchmarks" + # TODO: Support more benchmarking options later, against different branches, against self, etc + if: startsWith(github.event.comment.body, '@github-actions benchmark') + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) + # each user can only run one concurrent benchmark bot at a time + # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, you're gonna have + # to wait + group: ${{ github.actor }}-asv + cancel-in-progress: false + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + # Although asv sets up its own env, deps are still needed + # during discovery process + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Run benchmarks + id: bench + continue-on-error: true # This is a fake failure, asv will exit code 1 for regressions + run: | + # extracting the regex, see https://stackoverflow.com/a/36798723 + REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p") + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + asv machine --yes + asv continuous -f 1.1 -b $REGEX upstream/main HEAD + echo 'BENCH_OUTPUT<> $GITHUB_ENV + asv compare -f 1.1 upstream/main HEAD >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + echo "REGEX=$REGEX" >> $GITHUB_ENV + + - uses: actions/github-script@v6 + env: + BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} + REGEX: ${{env.REGEX}} + with: + script: | + const ENV_VARS = process.env + const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '\nBenchmarks completed. View runner logs here.' 
+ run_url + '\nRegex used: '+ 'regex ' + ENV_VARS["REGEX"] + '\n' + ENV_VARS["BENCH_OUTPUT"] + }) diff --git a/.github/workflows/autoupdate-pre-commit-config.yml b/.github/workflows/autoupdate-pre-commit-config.yml index 801e063f72726..376aa8343c571 100644 --- a/.github/workflows/autoupdate-pre-commit-config.yml +++ b/.github/workflows/autoupdate-pre-commit-config.yml @@ -2,19 +2,25 @@ name: "Update pre-commit config" on: schedule: - - cron: "0 7 * * 1" # At 07:00 on each Monday. + - cron: "0 7 1 * *" # At 07:00 on 1st of every month. workflow_dispatch: +permissions: + contents: read + jobs: update-pre-commit: + permissions: + contents: write # for technote-space/create-pr-action to push code + pull-requests: write # for technote-space/create-pr-action to create a PR if: github.repository_owner == 'pandas-dev' name: Autoupdate pre-commit config - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 - name: Cache multiple paths - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | ~/.cache/pre-commit diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index a5a802c678e20..0000000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,171 +0,0 @@ -name: CI - -on: - push: - branches: [master] - pull_request: - branches: - - master - - 1.2.x - -env: - ENV_FILE: environment.yml - PANDAS_CI: 1 - -jobs: - checks: - name: Checks - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - - steps: - - name: Checkout - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Looking for unwanted patterns - run: ci/code_checks.sh patterns - if: always() - - - name: Cache conda - uses: actions/cache@v2 - with: - path: ~/conda_pkgs_dir - key: ${{ runner.os }}-conda-${{ hashFiles('${{ env.ENV_FILE }}') }} - - - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: pandas-dev - channel-priority: strict - environment-file: ${{ env.ENV_FILE }} - use-only-tar-bz2: true - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Linting - run: ci/code_checks.sh lint - if: always() - - - name: Checks on imported code - run: ci/code_checks.sh code - if: always() - - - name: Running doctests - run: ci/code_checks.sh doctests - if: always() - - - name: Docstring validation - run: ci/code_checks.sh docstrings - if: always() - - - name: Typing validation - run: ci/code_checks.sh typing - if: always() - - - name: Testing docstring validation script - run: pytest scripts - if: always() - - - name: Running benchmarks - run: | - cd asv_bench - asv check -E existing - git remote add upstream https://github.com/pandas-dev/pandas.git - git fetch upstream - asv machine --yes - asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log - if grep "failed" benchmarks.log > /dev/null ; then - exit 1 - fi - if: always() - - - name: Publish benchmarks artifact - uses: actions/upload-artifact@master - with: - name: Benchmarks log - path: asv_bench/benchmarks.log - if: failure() - - web_and_docs: - name: Web and docs - runs-on: ubuntu-latest - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up pandas - uses: ./.github/actions/setup - - - name: Build website - run: | - source activate pandas-dev - python web/pandas_web.py web/pandas --target-path=web/build - - name: Build documentation - run: | - source activate pandas-dev - doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} 
- - # This can be removed when the ipython directive fails when there are errors, - # including the `tee sphinx.log` in te previous step (https://github.com/ipython/ipython/issues/11547) - - name: Check ipython directive errors - run: "! grep -B10 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" - - - name: Install ssh key - run: | - mkdir -m 700 -p ~/.ssh - echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: github.event_name == 'push' - - - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas - if: github.event_name == 'push' - - - name: Upload dev docs - run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev - if: github.event_name == 'push' - - - name: Move docs into site directory - run: mv doc/build/html web/build/docs - - name: Save website as an artifact - uses: actions/upload-artifact@v2 - with: - name: website - path: web/build - retention-days: 14 - - data_manager: - name: Test experimental data manager - runs-on: ubuntu-latest - strategy: - matrix: - pattern: ["not slow and not network and not clipboard", "slow"] - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up pandas - uses: ./.github/actions/setup - - - name: Run tests - env: - PANDAS_DATA_MANAGER: array - PATTERN: ${{ matrix.pattern }} - PYTEST_WORKERS: "auto" - run: | - source activate pandas-dev - ci/run_tests.sh - - - name: Print skipped tests - run: python ci/print_skipped.py diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 0000000000000..cb95b224ba677 --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,184 @@ +name: Code Checks + +on: + push: + branches: + - main + - 1.5.x + pull_request: + branches: + - main + - 1.5.x + +env: + ENV_FILE: environment.yml + PANDAS_CI: 1 + +permissions: + contents: read + +jobs: + pre_commit: + name: pre-commit + runs-on: ubuntu-22.04 + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-pre-commit + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Run pre-commit + uses: pre-commit/action@v2.0.3 + + typing_and_docstring_validation: + name: Docstring and typing validation + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-code-checks + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + # The following checks are independent of each other and should still be run if one fails + - name: Check for no warnings when building single-page docs + run: ci/code_checks.sh single-docs + if: ${{ 
steps.build.outcome == 'success' && always() }} + + - name: Run checks on imported code + run: ci/code_checks.sh code + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run doctests + run: ci/code_checks.sh doctests + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run docstring validation + run: ci/code_checks.sh docstrings + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Use existing environment for type checking + run: | + echo $PATH >> $GITHUB_PATH + echo "PYTHONHOME=$PYTHONHOME" >> $GITHUB_ENV + echo "PYTHONPATH=$PYTHONPATH" >> $GITHUB_ENV + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Typing + uses: pre-commit/action@v2.0.3 + with: + extra_args: --hook-stage manual --all-files + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run docstring validation script tests + run: pytest scripts + if: ${{ steps.build.outcome == 'success' && always() }} + + asv-benchmarks: + name: ASV Benchmarks + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-asv-benchmarks + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Run ASV benchmarks + run: | + cd asv_bench + asv machine --yes + asv run --quick --dry-run --strict --durations=30 --python=same + + build_docker_dev_environment: + name: Build Docker Dev Environment + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-build_docker_dev_environment + cancel-in-progress: true + + steps: + - name: Clean up dangling images + run: docker image prune -f + + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Build image + run: docker build --pull --no-cache --tag pandas-dev-env . 
+ + - name: Show environment + run: docker run --rm pandas-dev-env python -c "import pandas as pd; print(pd.show_versions())" + + requirements-dev-text-installable: + name: Test install requirements-dev.txt + runs-on: ubuntu-22.04 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-requirements-dev-text-installable + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + id: setup_python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + cache: 'pip' + cache-dependency-path: 'requirements-dev.txt' + + - name: Install requirements-dev.txt + run: pip install -r requirements-dev.txt + + - name: Check Pip Cache Hit + run: echo ${{ steps.setup_python.outputs.cache-hit }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000000..05a5d003c1dd1 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,31 @@ +name: CodeQL +on: + schedule: + # every day at midnight + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + analyze: + runs-on: ubuntu-22.04 + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: + - python + + steps: + - uses: actions/checkout@v3 + - uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - uses: github/codeql-action/autobuild@v2 + - uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml deleted file mode 100644 index dc396be753269..0000000000000 --- a/.github/workflows/comment_bot.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Comment-bot - -on: - issue_comment: - types: - - created - - edited - -jobs: - autotune: - name: "Fixup pre-commit formatting" - if: startsWith(github.event.comment.body, '@github-actions pre-commit') - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: r-lib/actions/pr-fetch@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Cache multiple paths - uses: actions/cache@v2 - with: - path: | - ~/.cache/pre-commit - ~/.cache/pip - key: pre-commit-dispatched-${{ runner.os }}-build - - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Install-pre-commit - run: python -m pip install --upgrade pre-commit - - name: Run pre-commit - run: pre-commit run --from-ref=origin/master --to-ref=HEAD --all-files || (exit 0) - - name: Commit results - run: | - git config user.name "$(git log -1 --pretty=format:%an)" - git config user.email "$(git log -1 --pretty=format:%ae)" - git commit -a -m 'Fixes from pre-commit [automated commit]' || echo "No changes to commit" - - uses: r-lib/actions/pr-push@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml deleted file mode 100644 index 292598dfcab73..0000000000000 --- a/.github/workflows/database.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: Database - -on: - push: - branches: [master] - pull_request: - branches: - - master - - 1.2.x - paths-ignore: - - "doc/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: ((not slow and not network and not clipboard) or (single and db)) - COVERAGE: true - -jobs: - Linux_py37_IO: - runs-on: ubuntu-latest - defaults: 
- run: - shell: bash -l {0} - - strategy: - matrix: - ENV_FILE: [ci/deps/actions-37-db-min.yaml, ci/deps/actions-37-db.yaml] - fail-fast: false - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - steps: - - name: Checkout - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Cache conda - uses: actions/cache@v2 - env: - CACHE_NUMBER: 0 - with: - path: ~/conda_pkgs_dir - key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('${{ matrix.ENV_FILE }}') }} - - - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: pandas-dev - channel-priority: flexible - environment-file: ${{ matrix.ENV_FILE }} - use-only-tar-bz2: true - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile --cov=pandas --cov-report=xml pandas/tests/io - if: always() - - - name: Build Version - run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - - name: Publish test results - uses: actions/upload-artifact@master - with: - name: Test results - path: test-data.xml - if: failure() - - - name: Print skipped tests - run: python ci/print_skipped.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - flags: unittests - name: codecov-pandas - fail_ci_if_error: true diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml new file mode 100644 index 0000000000000..13da56806de6e --- /dev/null +++ b/.github/workflows/docbuild-and-upload.yml @@ -0,0 +1,93 @@ +name: Doc Build and Upload + +on: + push: + branches: + - main + - 1.5.x + tags: + - '*' + pull_request: + branches: + - main + - 1.5.x + +env: + ENV_FILE: environment.yml + PANDAS_CI: 1 + +permissions: + contents: read + +jobs: + web_and_docs: + name: Doc Build and Upload + runs-on: ubuntu-22.04 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-web-docs + cancel-in-progress: true + + defaults: + run: + shell: bash -el {0} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Build website + run: python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: doc/make.py --warnings-are-errors + + - name: Build documentation zip + run: doc/make.py zip_html + + - name: Build the interactive terminal + run: | + cd web/interactive_terminal + jupyter lite build + + - name: Install ssh key + run: | + mkdir -m 700 -p ~/.ssh + echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFjYkJBk7sos+r7yATODogQc3jUdW1aascGpyOD4bohj8dWjzwLJv/OJ/fyOQ5lmj81WKDk67tGtqNJYGL9acII=" > ~/.ssh/known_hosts + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 
'refs/tags/')) + + - name: Copy cheatsheets into site directory + run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ + + - name: Upload web + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + - name: Upload dev docs + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/dev + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + - name: Upload prod docs + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/version/${GITHUB_REF_NAME:1} + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + + - name: Move docs into site directory + run: mv doc/build/html web/build/docs + + - name: Save website as an artifact + uses: actions/upload-artifact@v3 + with: + name: website + path: web/build + retention-days: 14 diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml new file mode 100644 index 0000000000000..5da2d0d281edd --- /dev/null +++ b/.github/workflows/macos-windows.yml @@ -0,0 +1,61 @@ +name: Windows-MacOS + +on: + push: + branches: + - main + - 1.5.x + pull_request: + branches: + - main + - 1.5.x + paths-ignore: + - "doc/**" + +env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + + +permissions: + contents: read + +jobs: + pytest: + defaults: + run: + shell: bash -el {0} + timeout-minutes: 180 + strategy: + matrix: + os: [macos-latest, windows-latest] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + pyarrow-version: ${{ matrix.os == 'macos-latest' && '6' || '' }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml deleted file mode 100644 index cb7d3fb5cabcf..0000000000000 --- a/.github/workflows/posix.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: Posix - -on: - push: - branches: [master] - pull_request: - branches: - - master - - 1.2.x - paths-ignore: - - "doc/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - -jobs: - pytest: - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - strategy: - matrix: - settings: [ - [actions-37-minimum_versions.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], - [actions-37.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], - [actions-37-locale_slow.yaml, "slow", "language-pack-it xsel", "it_IT.utf8", "it_IT.utf8", "", ""], - [actions-37-slow.yaml, "slow", "", "", "", "", ""], - [actions-38.yaml, "not slow and not network and not clipboard", "", "", "", "", ""], - [actions-38-slow.yaml, "slow", "", "", "", 
"", ""], - [actions-38-locale.yaml, "not slow and not network", "language-pack-zh-hans xsel", "zh_CN.utf8", "zh_CN.utf8", "", ""], - [actions-38-numpydev.yaml, "not slow and not network", "xsel", "", "", "deprecate", "-W error"], - [actions-39.yaml, "not slow and not network and not clipboard", "", "", "", "", ""] - ] - fail-fast: false - env: - COVERAGE: true - ENV_FILE: ci/deps/${{ matrix.settings[0] }} - PATTERN: ${{ matrix.settings[1] }} - EXTRA_APT: ${{ matrix.settings[2] }} - LANG: ${{ matrix.settings[3] }} - LC_ALL: ${{ matrix.settings[4] }} - PANDAS_TESTING_MODE: ${{ matrix.settings[5] }} - TEST_ARGS: ${{ matrix.settings[6] }} - - steps: - - name: Checkout - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Cache conda - uses: actions/cache@v2 - env: - CACHE_NUMBER: 0 - with: - path: ~/conda_pkgs_dir - key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('${{ env.ENV_FILE }}') }} - - - name: Extra installs - run: sudo apt-get update && sudo apt-get install -y libc6-dev-i386 ${{ env.EXTRA_APT }} - - - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: pandas-dev - channel-priority: flexible - environment-file: ${{ env.ENV_FILE }} - use-only-tar-bz2: true - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - run: ci/run_tests.sh - if: always() - - - name: Build Version - run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - - name: Publish test results - uses: actions/upload-artifact@master - with: - name: Test results - path: test-data.xml - if: failure() - - - name: Print skipped tests - run: python ci/print_skipped.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - flags: unittests - name: codecov-pandas - fail_ci_if_error: false diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml deleted file mode 100644 index 723347913ac38..0000000000000 --- a/.github/workflows/pre-commit.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: pre-commit - -on: - pull_request: - push: - branches: [master] - -jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 38b1aa9ae7047..0d265182b3924 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -1,72 +1,92 @@ +# This workflow may or may not run depending on the state of the next +# unreleased Python version. DO NOT DELETE IT. +# +# In general, this file will remain frozen(present, but not running) until: +# - The next unreleased Python version has released beta 1 +# - This version should be available on Github Actions. +# - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) +# support that unreleased Python version. +# To unfreeze, comment out the ``if: false`` condition, and make sure you update +# the name of the workflow and Python version in actions/setup-python to: '3.12-dev' +# +# After it has been unfrozen, this file should remain unfrozen(present, and running) until: +# - The next Python version has been officially released. +# OR +# - Most/All of our optional dependencies support Python 3.11 AND +# - The next Python version has released a rc(we are guaranteed a stable ABI). +# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs +# to the corresponding posix/windows-macos/sdist etc. workflows. 
+# Feel free to modify this comment as necessary. + name: Python Dev on: push: branches: - - master + - main + - 1.5.x pull_request: branches: - - master + - main + - 1.5.x paths-ignore: - "doc/**" +env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + COVERAGE: true + PYTEST_TARGET: pandas + +permissions: + contents: read + jobs: build: - runs-on: ubuntu-latest - name: actions-310-dev - timeout-minutes: 60 + # if: false # Uncomment this to freeze the workflow, comment it to unfreeze + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, macOS-latest, windows-latest] + + name: actions-311-dev + timeout-minutes: 120 + + concurrency: + #https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + cancel-in-progress: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 - name: Set up Python Dev Version - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: '3.10-dev' + python-version: '3.11-dev' - name: Install dependencies run: | + python --version python -m pip install --upgrade pip setuptools wheel - pip install git+https://github.com/numpy/numpy.git - pip install git+https://github.com/pytest-dev/pytest.git - pip install git+https://github.com/nedbat/coveragepy.git - pip install cython python-dateutil pytz hypothesis pytest-xdist - pip list + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy + python -m pip install git+https://github.com/nedbat/coveragepy.git + python -m pip install python-dateutil pytz cython hypothesis==6.52.1 pytest>=6.2.5 pytest-xdist pytest-cov pytest-asyncio>=0.17 + python -m pip list + # GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs - name: Build Pandas run: | - python setup.py build_ext -q -j2 - python -m pip install -e . --no-build-isolation --no-use-pep517 + python setup.py build_ext -q -j1 + python -m pip install -e . 
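The concurrency groups added throughout these workflows use the expression `${{ github.event_name == 'push' && github.run_number || github.ref }}`. GitHub Actions expressions have no ternary operator, so this is the short-circuit `cond && a || b` idiom. The same idiom works in Python, with the same caveat: a falsy middle operand silently falls through to the right-hand side. A minimal illustration, not workflow code:

```python
# The `cond && a || b` idiom from the concurrency groups above, in Python.
def group_key(event_name, run_number, ref):
    # mirrors: ${{ github.event_name == 'push' && github.run_number || github.ref }}
    return event_name == "push" and run_number or ref

assert group_key("push", 42, "refs/heads/main") == 42
assert group_key("pull_request", 42, "refs/pull/1/merge") == "refs/pull/1/merge"
# Caveat of the idiom: a falsy middle operand falls through to the right side.
assert group_key("push", 0, "refs/heads/main") == "refs/heads/main"
```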
--no-build-isolation --no-use-pep517 --no-index - name: Build Version run: | python -c "import pandas; pandas.show_versions();" - - name: Test with pytest - run: | - coverage run -m pytest -m 'not slow and not network and not clipboard' pandas - continue-on-error: true - - - name: Publish test results - uses: actions/upload-artifact@master - with: - name: Test results - path: test-data.xml - if: failure() - - - name: Print skipped tests - run: | - python ci/print_skipped.py - - - name: Report Coverage - run: | - coverage report -m - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - flags: unittests - name: codecov-pandas - fail_ci_if_error: true + - name: Test + uses: ./.github/actions/run-tests diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml new file mode 100644 index 0000000000000..46b453532ad0b --- /dev/null +++ b/.github/workflows/sdist.yml @@ -0,0 +1,95 @@ +name: sdist + +on: + push: + branches: + - main + - 1.5.x + pull_request: + branches: + - main + - 1.5.x + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + +permissions: + contents: read + +jobs: + build: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + timeout-minutes: 60 + defaults: + run: + shell: bash -el {0} + + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist + cancel-in-progress: true + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + + # GH 39416 + pip install numpy + + - name: Build pandas sdist + run: | + pip list + python setup.py sdist --formats=gztar + + - name: Upload sdist artifact + uses: actions/upload-artifact@v3 + with: + name: ${{matrix.python-version}}-sdist.gz + path: dist/*.gz + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: false + environment-name: pandas-sdist + extra-specs: | + python =${{ matrix.python-version }} + + - name: Install pandas from sdist + run: | + pip list + python -m pip install dist/*.gz + + - name: Force oldest supported NumPy + run: | + case "${{matrix.python-version}}" in + 3.8) + pip install numpy==1.20.3 ;; + 3.9) + pip install numpy==1.20.3 ;; + 3.10) + pip install numpy==1.21.2 ;; + 3.11) + pip install numpy==1.23.2 ;; + esac + + - name: Import pandas + run: | + cd .. + conda list + python -c "import pandas; pandas.show_versions();" diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 2f55a180bc88c..c47745e097d17 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -4,18 +4,23 @@ on: # * is a special character in YAML so you have to quote this string - cron: "0 0 * * *" +permissions: + contents: read + jobs: stale: - runs-on: ubuntu-latest + permissions: + pull-requests: write + runs-on: ubuntu-22.04 steps: - - uses: actions/stale@v3 + - uses: actions/stale@v4 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. 
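The "Force oldest supported NumPy" step in the sdist workflow above pins a different NumPy floor per Python version via a shell `case` statement. The same table, written as a small Python lookup; the pins are copied from the workflow, the helper itself is only illustrative:

```python
# Oldest supported NumPy per Python version, as pinned by the sdist workflow above.
OLDEST_SUPPORTED_NUMPY = {
    "3.8": "1.20.3",
    "3.9": "1.20.3",
    "3.10": "1.21.2",
    "3.11": "1.23.2",
}

def numpy_pin(python_version: str) -> str:
    """Return the pip requirement string used to test against the oldest NumPy."""
    return f"numpy=={OLDEST_SUPPORTED_NUMPY[python_version]}"

assert numpy_pin("3.10") == "numpy==1.21.2"
```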
Please update or respond to this comment if you're still interested in working on this." - skip-stale-pr-message: false + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." stale-pr-label: "Stale" exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" - days-before-stale: 30 + days-before-issue-stale: -1 + days-before-pr-stale: 30 days-before-close: -1 remove-stale-when-updated: false debug-only: false diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml new file mode 100644 index 0000000000000..4602d12d8505e --- /dev/null +++ b/.github/workflows/ubuntu.yml @@ -0,0 +1,161 @@ +name: Ubuntu + +on: + push: + branches: + - main + - 1.5.x + pull_request: + branches: + - main + - 1.5.x + paths-ignore: + - "doc/**" + +env: + PANDAS_CI: 1 + +permissions: + contents: read + +jobs: + pytest: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + timeout-minutes: 180 + strategy: + matrix: + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] + pattern: ["not single_cpu", "single_cpu"] + # Don't test pyarrow v2/3: Causes timeouts in read_csv engine + # even if tests are skipped/xfailed + pyarrow_version: ["5", "6", "7"] + include: + - name: "Downstream Compat" + env_file: actions-38-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + - name: "Minimum Versions" + env_file: actions-38-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + - name: "Locale: it_IT.utf8" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + - name: "Locale: zh_CN.utf8" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Data Manager" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + pandas_data_manager: "array" + - name: "Pypy" + env_file: actions-pypy-38.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + - name: "Numpy Dev" + env_file: actions-310-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + pandas_testing_mode: "deprecate" + test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" + exclude: + - env_file: actions-39.yaml + pyarrow_version: "6" + - env_file: actions-39.yaml + pyarrow_version: "7" + - env_file: actions-310.yaml + pyarrow_version: "6" + - env_file: actions-310.yaml + pyarrow_version: "7" + fail-fast: false + name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + PATTERN: ${{ matrix.pattern }} + EXTRA_APT: ${{ matrix.extra_apt || '' }} + LANG: ${{ matrix.lang || '' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }} + PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} + 
TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }} + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} + # TODO: re-enable coverage on pypy, its slow + COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} + cancel-in-progress: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Extra installs + # xsel for clipboard tests + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + pyarrow-version: ${{ matrix.pyarrow_version }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + # TODO: Don't continue on error for PyPy + continue-on-error: ${{ env.IS_PYPY == 'true' }} diff --git a/.gitignore b/.gitignore index 2c337be60e94e..07b1f056d511b 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,8 @@ dist *.egg-info .eggs .pypirc +# type checkers +pandas/py.typed # tox testing tool .tox @@ -120,3 +122,7 @@ doc/build/html/index.html doc/tmp.sv env/ doc/source/savefig/ + +# Interactive terminal generated files # +######################################## +.jupyterlite.doit.db diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d580fcf4fc545..2ca5b5c9b896b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,32 +1,42 @@ -minimum_pre_commit_version: 2.9.2 +minimum_pre_commit_version: 2.15.0 exclude: ^LICENSES/|\.(html|csv|svg)$ +# reserve "manual" for mypy and pyright +default_stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite] ci: autofix_prs: false repos: - repo: https://github.com/MarcoGorelli/absolufy-imports - rev: v0.3.0 + rev: v0.3.1 hooks: - id: absolufy-imports files: ^pandas/ +- repo: https://github.com/jendrikseipp/vulture + rev: 'v2.5' + hooks: + - id: vulture + entry: python scripts/run_vulture.py + pass_filenames: true + require_serial: false - repo: https://github.com/python/black - rev: 21.5b2 + rev: 22.6.0 hooks: - id: black - repo: https://github.com/codespell-project/codespell - rev: v2.0.0 + rev: v2.1.0 hooks: - id: codespell types_or: [python, rst, markdown] - files: ^(pandas|doc)/ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.3.0 hooks: - id: debug-statements - id: end-of-file-fixer exclude: \.txt$ + stages: [commit, merge-commit, push, 
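The mysql, postgres, and moto service containers defined in the Ubuntu workflow above are what the SQL and S3 tests talk to on localhost. The fixtures pandas uses to connect are not part of this diff, so the connection strings below are an assumption derived from the service environment (user, password, database, port) rather than the actual test code:

```python
# Connection URLs implied by the service definitions above; treat them as an
# assumption about how the runner would reach the containers, not as the
# fixtures pandas actually uses.
import sqlalchemy

PG_URL = "postgresql+psycopg2://postgres:postgres@localhost:5432/pandas"
MYSQL_URL = "mysql+pymysql://root:@localhost:3306/pandas"  # empty root password allowed
# moto (mock AWS) listens on http://localhost:5000 with the dummy
# foobar_key / foobar_secret credentials set in the workflow.

engine = sqlalchemy.create_engine(PG_URL)
with engine.connect() as conn:
    print(conn.execute(sqlalchemy.text("SELECT 1")).scalar())
```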
prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite] - id: trailing-whitespace + stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite] - repo: https://github.com/cpplint/cpplint - rev: 1.5.5 + rev: 1.6.0 hooks: - id: cpplint # We don't lint all C files because we don't want to lint any that are built @@ -35,34 +45,25 @@ repos: # we can lint all header files since they aren't "generated" like C files are. exclude: ^pandas/_libs/src/(klib|headers)/ args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 hooks: - id: flake8 - additional_dependencies: - - flake8-comprehensions==3.1.0 - - flake8-bugbear==21.3.2 - - pandas-dev-flaker==0.2.0 - - id: flake8 - name: flake8 (cython) - types: [cython] - args: [--append-config=flake8/cython.cfg] - - id: flake8 - name: flake8 (cython template) - files: \.pxi\.in$ - types: [text] - args: [--append-config=flake8/cython-template.cfg] + additional_dependencies: &flake8_dependencies + - flake8==5.0.4 + - flake8-bugbear==22.7.1 + - pandas-dev-flaker==0.5.0 - repo: https://github.com/PyCQA/isort - rev: 5.8.0 + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.18.3 + rev: v2.37.3 hooks: - id: pyupgrade - args: [--py37-plus] + args: [--py38-plus] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.8.0 + rev: v1.9.0 hooks: - id: rst-backticks - id: rst-directive-colons @@ -71,17 +72,54 @@ repos: - id: rst-inline-touching-normal types: [text] # overwrite types: [rst] types_or: [python, rst] +- repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.6.1 + hooks: + - id: sphinx-lint - repo: https://github.com/asottile/yesqa - rev: v1.2.3 + rev: v1.3.0 hooks: - id: yesqa - additional_dependencies: - - flake8==3.9.2 - - flake8-comprehensions==3.1.0 - - flake8-bugbear==21.3.2 - - pandas-dev-flaker==0.2.0 + additional_dependencies: *flake8_dependencies - repo: local hooks: + - id: pyright + # note: assumes python env is setup and activated + name: pyright + entry: pyright + language: node + pass_filenames: false + types: [python] + stages: [manual] + additional_dependencies: &pyright_dependencies + - pyright@1.1.264 + - id: pyright_reportGeneralTypeIssues + # note: assumes python env is setup and activated + name: pyright reportGeneralTypeIssues + entry: pyright --skipunannotated -p pyright_reportGeneralTypeIssues.json + language: node + pass_filenames: false + types: [python] + stages: [manual] + additional_dependencies: *pyright_dependencies + - id: mypy + # note: assumes python env is setup and activated + name: mypy + entry: mypy + language: system + pass_filenames: false + types: [python] + stages: [manual] + - id: stubtest + # note: assumes python env is setup and activated + # note: requires pandas dev to be installed + name: mypy (stubtest) + entry: python + language: system + pass_filenames: false + types: [pyi] + args: [scripts/run_stubtest.py] + stages: [manual] - id: flake8-rst name: flake8-rst description: Run flake8 on code snippets in docstrings or RST files @@ -102,7 +140,42 @@ repos: # Incorrect code-block / IPython directives |\.\.\ code-block\ :: |\.\.\ ipython\ :: + # directive should not have a space before :: + |\.\.\ \w+\ :: + + # Check for deprecated messages without sphinx directive + 
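The flake8 and yesqa hooks above now share one pinned dependency list through a YAML anchor/alias pair (`&flake8_dependencies` / `*flake8_dependencies`, and likewise `&pyright_dependencies`), which is what keeps the pinned versions from drifting apart between hooks. A small PyYAML check of how the alias expands:

```python
# How the &flake8_dependencies anchor and *flake8_dependencies alias used in
# the pre-commit config expand: both hooks see the identical dependency list.
import yaml

snippet = """
flake8:
  additional_dependencies: &flake8_dependencies
    - flake8==5.0.4
    - flake8-bugbear==22.7.1
    - pandas-dev-flaker==0.5.0
yesqa:
  additional_dependencies: *flake8_dependencies
"""
cfg = yaml.safe_load(snippet)
assert cfg["flake8"]["additional_dependencies"] == cfg["yesqa"]["additional_dependencies"]
```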
|(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) types_or: [python, cython, rst] + - id: cython-casting + name: Check Cython casting is `obj`, not ` obj` + language: pygrep + entry: '[a-zA-Z0-9*]> ' + files: (\.pyx|\.pxi.in)$ + - id: incorrect-backticks + name: Check for backticks incorrectly rendering because of missing spaces + language: pygrep + entry: '[a-zA-Z0-9]\`\`?[a-zA-Z0-9]' + types: [rst] + files: ^doc/source/ + - id: seed-check-asv + name: Check for unnecessary random seeds in asv benchmarks + language: pygrep + entry: 'np\.random\.seed' + files: ^asv_bench/benchmarks + exclude: ^asv_bench/benchmarks/pandas_vb_common\.py + - id: np-testing-array-equal + name: Check for usage of numpy testing or array_equal + language: pygrep + entry: '(numpy|np)(\.testing|\.array_equal)' + files: ^pandas/tests/ + types: [python] + - id: invalid-ea-testing + name: Check for invalid EA testing + language: pygrep + entry: 'tm\.assert_(series|frame)_equal' + files: ^pandas/tests/extension/base + types: [python] + exclude: ^pandas/tests/extension/base/base\.py - id: pip-to-conda name: Generate pip dependency from conda description: This hook checks if the conda environment.yml and requirements-dev.txt are equal @@ -110,7 +183,7 @@ repos: entry: python scripts/generate_pip_deps_from_conda.py files: ^(environment.yml|requirements-dev.txt)$ pass_filenames: false - additional_dependencies: [pyyaml] + additional_dependencies: [pyyaml, toml] - id: sync-flake8-versions name: Check flake8 version is synced across flake8, yesqa, and environment.yml language: python @@ -131,8 +204,51 @@ repos: files: ^pandas/core/ exclude: ^pandas/core/api\.py$ types: [python] + - id: use-io-common-urlopen + name: Use pandas.io.common.urlopen instead of urllib.request.urlopen + language: python + entry: python scripts/use_io_common_urlopen.py + files: ^pandas/ + exclude: ^pandas/tests/ + types: [python] - id: no-bool-in-core-generic name: Use bool_t instead of bool in pandas/core/generic.py entry: python scripts/no_bool_in_generic.py language: python files: ^pandas/core/generic\.py$ + - id: pandas-errors-documented + name: Ensure pandas errors are documented in doc/source/reference/testing.rst + entry: python scripts/pandas_errors_documented.py + language: python + files: ^pandas/errors/__init__.py$ + - id: pg8000-not-installed-CI + name: Check for pg8000 not installed on CI for test_pg8000_sqlalchemy_passthrough_error + language: pygrep + entry: 'pg8000' + files: ^ci/deps + types: [yaml] + - id: validate-min-versions-in-sync + name: Check minimum version of dependencies are aligned + entry: python scripts/validate_min_versions_in_sync.py + language: python + files: ^(ci/deps/actions-.*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ + - id: flake8-pyi + name: flake8-pyi + entry: flake8 --extend-ignore=E301,E302,E305,E701,E704 + types: [pyi] + language: python + additional_dependencies: + - flake8==5.0.4 + - flake8-pyi==22.8.1 + - id: future-annotations + name: import annotations from __future__ + entry: 'from __future__ import annotations' + language: pygrep + args: [--negate] + files: ^pandas/ + types: [python] + exclude: | + (?x) + /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ + |/tests/ + |/_testing/ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000..0161dfa92fdef --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,10 @@ +cff-version: 1.2.0 +title: 'pandas-dev/pandas: Pandas' +message: 'If you use this software, please cite it as below.' 
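Most of the new local hooks above are `pygrep` checks, i.e. bare regular expressions run over the matched files rather than Python scripts. For example, the incorrect-backticks hook flags RST backticks glued to a surrounding word, which renders wrongly. A quick check of what that pattern matches, using Python's `re`:

```python
# The incorrect-backticks hook above is a pygrep check: a plain regex over RST
# files that flags backticks missing a space on either side.
import re

pattern = re.compile(r"[a-zA-Z0-9]``?[a-zA-Z0-9]")

assert pattern.search("use``DataFrame.apply`` here")       # missing space -> flagged
assert pattern.search("``DataFrame.apply``is a method")    # missing space -> flagged
assert not pattern.search("use ``DataFrame.apply`` here")  # well-formed -> passes
```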
+authors: + - name: "The pandas development team" +license: BSD-3-Clause +license-url: "https://github.com/pandas-dev/pandas/blob/main/LICENSE" +repository-code: "https://github.com/pandas-dev/pandas" +type: software +url: "https://github.com/pandas-dev/pandas" diff --git a/Dockerfile b/Dockerfile index de1c564921de9..7230dcab20f6e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,48 +1,13 @@ -FROM quay.io/condaforge/miniforge3 +FROM python:3.10.8 +WORKDIR /home/pandas -# if you forked pandas, you can pass in your own GitHub username to use your fork -# i.e. gh_username=myname -ARG gh_username=pandas-dev -ARG pandas_home="/home/pandas" +RUN apt-get update && apt-get -y upgrade +RUN apt-get install -y build-essential -# Avoid warnings by switching to noninteractive -ENV DEBIAN_FRONTEND=noninteractive +# hdf5 needed for pytables installation +RUN apt-get install -y libhdf5-dev -# Configure apt and install packages -RUN apt-get update \ - && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ - # - # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed - && apt-get -y install git iproute2 procps iproute2 lsb-release \ - # - # cleanup - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* - -# Switch back to dialog for any ad-hoc use of apt-get -ENV DEBIAN_FRONTEND=dialog - -# Clone pandas repo -RUN mkdir "$pandas_home" \ - && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \ - && cd "$pandas_home" \ - && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ - && git pull upstream master - -# Because it is surprisingly difficult to activate a conda environment inside a DockerFile -# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), -# we just update the base/root one from the 'environment.yml' file instead of creating a new one. -# -# Set up environment -RUN conda install -y mamba -RUN mamba env update -n base -f "$pandas_home/environment.yml" - -# Build C extensions and pandas -SHELL ["/bin/bash", "-c"] -RUN . /opt/conda/etc/profile.d/conda.sh \ - && conda activate base \ - && cd "$pandas_home" \ - && export \ - && python setup.py build_ext -j 4 \ - && python -m pip install -e . +RUN python -m pip install --upgrade pip +RUN python -m pip install \ + -r https://raw.githubusercontent.com/pandas-dev/pandas/main/requirements-dev.txt +CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE index a0cc369f725b8..d4e49a140f1cb 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2021, Open source contributors. +Copyright (c) 2011-2022, Open source contributors. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/LICENSES/KLIB_LICENSE b/LICENSES/KLIB_LICENSE new file mode 100644 index 0000000000000..0a996fae3360f --- /dev/null +++ b/LICENSES/KLIB_LICENSE @@ -0,0 +1,23 @@ +The MIT License + +Copyright (c) 2008- Attractive Chaos + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSES/OTHER b/LICENSES/OTHER index f0550b4ee208a..7446d68eb43a6 100644 --- a/LICENSES/OTHER +++ b/LICENSES/OTHER @@ -1,8 +1,3 @@ -numpydoc license ----------------- - -The numpydoc license is in pandas/doc/sphinxext/LICENSE.txt - Bottleneck license ------------------ @@ -77,4 +72,4 @@ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index 3b2886eb9cfae..a905fb017d813 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -28,7 +28,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. 
diff --git a/MANIFEST.in b/MANIFEST.in index d0d93f2cdba8c..d2b1b8cb887bc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include RELEASE.md +include versioneer.py graft doc prune doc/build @@ -17,33 +18,43 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl +global-exclude *.msgpack global-exclude *.pdf global-exclude *.pickle global-exclude *.png global-exclude *.pptx -global-exclude *.pyc -global-exclude *.pyd global-exclude *.ods global-exclude *.odt +global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so global-exclude *.xls +global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt +global-exclude *.cpt global-exclude *.xz global-exclude *.zip +global-exclude *.zst global-exclude *~ global-exclude .DS_Store global-exclude .git* global-exclude \#* +global-exclude *.c +global-exclude *.cpp +global-exclude *.h + +global-exclude *.py[ocd] +global-exclude *.pxi + # GH 39321 # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist prune pandas/tests/io/parser/data -include versioneer.py -include pandas/_version.py -include pandas/io/formats/templates/*.tpl +# Selectively re-add *.cxx files that were excluded above +graft pandas/_libs/src +graft pandas/_libs/tslibs/src diff --git a/Makefile b/Makefile index 1fdd3cfdcf027..c0aa685ed47ac 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ build: clean_pyc python setup.py build_ext lint-diff: - git diff upstream/master --name-only -- "*.py" | xargs flake8 + git diff upstream/main --name-only -- "*.py" | xargs flake8 black: black . diff --git a/README.md b/README.md index 04b346c198e90..aaf63ead9c416 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,9 @@ [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) -[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE) -[![Azure Build Status](https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master)](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master) -[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master)](https://codecov.io/gh/pandas-dev/pandas) -[![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org) +[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) +[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas) +[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) @@ -136,7 +135,7 @@ or alternatively python setup.py 
develop ``` -See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/install.html#installing-from-source). +See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source). ## License [BSD 3](LICENSE) @@ -160,7 +159,7 @@ Most development discussions take place on GitHub in this repo. Further, the [pa All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. -A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. +A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. @@ -170,4 +169,4 @@ Or maybe through using pandas you have an idea of your own or are looking for so Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). -As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md) +As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md) diff --git a/RELEASE.md b/RELEASE.md index 42cb82dfcf020..344a097a3e81e 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ Release Notes ============= -The list of changes to Pandas between each release can be found +The list of changes to pandas between each release can be found [here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full details, see the commit logs at https://github.com/pandas-dev/pandas. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index e8e82edabbfa3..b1ea2682b7ea7 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -13,6 +13,10 @@ // benchmarked "repo": "..", + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], + // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically @@ -25,7 +29,6 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - // "pythons": ["2.7", "3.4"], "pythons": ["3.8"], // The matrix of dependencies to test. Each key is the name of a @@ -39,24 +42,21 @@ // followed by the pip installed packages). 
"matrix": { "numpy": [], - "Cython": ["0.29.21"], + "Cython": ["0.29.32"], "matplotlib": [], "sqlalchemy": [], "scipy": [], "numba": [], "numexpr": [], "pytables": [null, ""], // platform dependent, see excludes below + "pyarrow": [], "tables": [null, ""], "openpyxl": [], "xlsxwriter": [], "xlrd": [], "xlwt": [], "odfpy": [], - "pytest": [], "jinja2": [], - // If using Windows with python 2.7 and want to build using the - // mingw toolchain (rather than MSVC), uncomment the following line. - // "libpython": [], }, "conda_channels": ["defaults", "conda-forge"], // Combinations of libraries/python versions can be excluded/included diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index e48a2060a3b34..0008a589ca71f 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -34,7 +34,7 @@ class Factorize: param_names = ["unique", "sort", "dtype"] def setup(self, unique, sort, dtype): - N = 10 ** 5 + N = 10**5 string_index = tm.makeStringIndex(N) string_arrow = None if dtype == "string[pyarrow]": @@ -44,9 +44,9 @@ def setup(self, unique, sort, dtype): raise NotImplementedError data = { - "int": pd.Int64Index(np.arange(N)), - "uint": pd.UInt64Index(np.arange(N)), - "float": pd.Float64Index(np.random.randn(N)), + "int": pd.Index(np.arange(N), dtype="int64"), + "uint": pd.Index(np.arange(N), dtype="uint64"), + "float": pd.Index(np.random.randn(N), dtype="float64"), "object": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( @@ -74,11 +74,11 @@ class Duplicated: param_names = ["unique", "keep", "dtype"] def setup(self, unique, keep, dtype): - N = 10 ** 5 + N = 10**5 data = { - "int": pd.Int64Index(np.arange(N)), - "uint": pd.UInt64Index(np.arange(N)), - "float": pd.Float64Index(np.random.randn(N)), + "int": pd.Index(np.arange(N), dtype="int64"), + "uint": pd.Index(np.arange(N), dtype="uint64"), + "float": pd.Index(np.random.randn(N), dtype="float64"), "string": tm.makeStringIndex(N), "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( @@ -97,7 +97,7 @@ def time_duplicated(self, unique, keep, dtype): class Hashing: def setup_cache(self): - N = 10 ** 5 + N = 10**5 df = pd.DataFrame( { @@ -145,7 +145,7 @@ class Quantile: param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): - N = 10 ** 5 + N = 10**5 data = { "int": np.arange(N), "uint": np.arange(N).astype(np.uint64), @@ -158,7 +158,7 @@ def time_quantile(self, quantile, interpolation, dtype): class SortIntegerArray: - params = [10 ** 3, 10 ** 5] + params = [10**3, 10**5] def setup(self, N): data = np.arange(N, dtype=float) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 296101c9f9800..16d90b9d23741 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -1,9 +1,8 @@ import numpy as np -from pandas.compat.numpy import np_version_under1p20 - from pandas import ( Categorical, + Index, NaT, Series, date_range, @@ -50,7 +49,7 @@ def setup(self, dtype): elif dtype in ["category[object]", "category[int]"]: # Note: sizes are different in this case than others - n = 5 * 10 ** 5 + n = 5 * 10**5 sample_size = 100 arr = list(np.random.randint(0, n // 10, size=n)) @@ -175,7 +174,7 @@ class IsinWithArange: def setup(self, dtype, M, offset_factor): offset = int(M * offset_factor) - tmp = Series(np.random.randint(offset, M + offset, 10 ** 6)) + tmp = 
Series(np.random.randint(offset, M + offset, 10**6)) self.series = tmp.astype(dtype) self.values = np.arange(M).astype(dtype) @@ -192,8 +191,8 @@ class IsInFloat64: param_names = ["dtype", "title"] def setup(self, dtype, title): - N_many = 10 ** 5 - N_few = 10 ** 6 + N_many = 10**5 + N_few = 10**6 self.series = Series([1, 2], dtype=dtype) if title == "many_different_values": @@ -241,10 +240,10 @@ class IsInForObjects: param_names = ["series_type", "vals_type"] def setup(self, series_type, vals_type): - N_many = 10 ** 5 + N_many = 10**5 if series_type == "nans": - ser_vals = np.full(10 ** 4, np.nan) + ser_vals = np.full(10**4, np.nan) elif series_type == "short": ser_vals = np.arange(2) elif series_type == "long": @@ -255,7 +254,7 @@ def setup(self, series_type, vals_type): self.series = Series(ser_vals).astype(object) if vals_type == "nans": - values = np.full(10 ** 4, np.nan) + values = np.full(10**4, np.nan) elif vals_type == "short": values = np.arange(2) elif vals_type == "long": @@ -278,11 +277,7 @@ class IsInLongSeriesLookUpDominates: param_names = ["dtype", "MaxNumber", "series_type"] def setup(self, dtype, MaxNumber, series_type): - N = 10 ** 7 - - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError + N = 10**7 if series_type == "random_hits": array = np.random.randint(0, MaxNumber, N) @@ -294,7 +289,8 @@ def setup(self, dtype, MaxNumber, series_type): array = np.arange(N) + MaxNumber self.series = Series(array).astype(dtype) - self.values = np.arange(MaxNumber).astype(dtype) + + self.values = np.arange(MaxNumber).astype(dtype.lower()) def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) @@ -308,20 +304,39 @@ class IsInLongSeriesValuesDominate: param_names = ["dtype", "series_type"] def setup(self, dtype, series_type): - N = 10 ** 7 - - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError + N = 10**7 if series_type == "random": vals = np.random.randint(0, 10 * N, N) if series_type == "monotone": vals = np.arange(N) - self.values = vals.astype(dtype) - M = 10 ** 6 + 1 + self.values = vals.astype(dtype.lower()) + M = 10**6 + 1 self.series = Series(np.arange(M)).astype(dtype) def time_isin(self, dtypes, series_type): self.series.isin(self.values) + + +class IsInWithLongTupples: + def setup(self): + t = tuple(range(1000)) + self.series = Series([t] * 1000) + self.values = [t] + + def time_isin(self): + self.series.isin(self.values) + + +class IsInIndexes: + def setup(self): + self.range_idx = Index(range(1000)) + self.index = Index(list(range(1000))) + self.series = Series(np.random.randint(100_000, size=1000)) + + def time_isin_range_index(self): + self.series.isin(self.range_idx) + + def time_isin_index(self): + self.series.isin(self.index) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index bfb1be8705495..496db66c78569 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -59,7 +59,7 @@ def time_frame_op_with_scalar(self, dtype, scalar, op): class OpWithFillValue: def setup(self): # GH#31300 - arr = np.arange(10 ** 6) + arr = np.arange(10**6) df = DataFrame({"A": arr}) ser = df["A"] @@ -93,7 +93,7 @@ class MixedFrameWithSeriesAxis: param_names = ["opname"] def setup(self, opname): - arr = np.arange(10 ** 6).reshape(1000, -1) + arr = np.arange(10**6).reshape(1000, -1) df = DataFrame(arr) df["C"] = 
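The isin benchmarks above now call `.astype(dtype.lower())` on the NumPy array of lookup values while keeping the original spelling for the Series. That is because the capitalized names ("Int64", "Float64") are pandas' nullable extension dtypes, which NumPy does not understand; only pandas objects accept them. A short illustration of the distinction:

```python
# Why the benchmark lowercases the dtype before astype on a NumPy array:
# "Int64"/"Float64" are pandas nullable (masked) dtypes, unknown to NumPy.
import numpy as np
import pandas as pd

dtype = "Int64"

values = np.arange(5).astype(dtype.lower())      # plain numpy int64: fine
series = pd.Series(np.arange(5)).astype(dtype)   # pandas nullable Int64: fine

try:
    np.arange(5).astype(dtype)                   # NumPy rejects the pandas dtype name
except TypeError as err:
    print("numpy rejects it:", err)
```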
1.0 self.df = df @@ -144,7 +144,7 @@ def setup(self, op, shape): # should already be the case, but just to be sure df._consolidate_inplace() - # TODO: GH#33198 the setting here shoudlnt need two steps + # TODO: GH#33198 the setting here shouldn't need two steps arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8") arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8") arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8") @@ -201,7 +201,7 @@ def teardown(self, use_numexpr, threads): class Ops2: def setup(self): - N = 10 ** 3 + N = 10**3 self.df = DataFrame(np.random.randn(N, N)) self.df2 = DataFrame(np.random.randn(N, N)) @@ -258,7 +258,7 @@ class Timeseries: param_names = ["tz"] def setup(self, tz): - N = 10 ** 6 + N = 10**6 halfway = (N // 2) - 1 self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) self.ts = self.s[halfway] @@ -280,7 +280,7 @@ def time_timestamp_ops_diff_with_shift(self, tz): class IrregularOps: def setup(self): - N = 10 ** 5 + N = 10**5 idx = date_range(start="1/1/2000", periods=N, freq="s") s = Series(np.random.randn(N), index=idx) self.left = s.sample(frac=1) @@ -304,7 +304,7 @@ class CategoricalComparisons: param_names = ["op"] def setup(self, op): - N = 10 ** 5 + N = 10**5 self.cat = pd.Categorical(list("aabbcd") * N, ordered=True) def time_categorical_op(self, op): @@ -317,7 +317,7 @@ class IndexArithmetic: param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} self.index = getattr(tm, indexes[dtype])(N) @@ -343,7 +343,7 @@ class NumericInferOps: param_names = ["dtype"] def setup(self, dtype): - N = 5 * 10 ** 5 + N = 5 * 10**5 self.df = DataFrame( {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)} ) @@ -367,7 +367,7 @@ def time_modulo(self, dtype): class DateInferOps: # from GH 7332 def setup_cache(self): - N = 5 * 10 ** 5 + N = 5 * 10**5 df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")}) df["timedelta"] = df["datetime64"] - df["datetime64"] return df @@ -388,7 +388,7 @@ class AddOverflowScalar: param_names = ["scalar"] def setup(self, scalar): - N = 10 ** 6 + N = 10**6 self.arr = np.arange(N) def time_add_overflow_scalar(self, scalar): @@ -397,7 +397,7 @@ def time_add_overflow_scalar(self, scalar): class AddOverflowArray: def setup(self): - N = 10 ** 6 + N = 10**6 self.arr = np.arange(N) self.arr_rev = np.arange(-N, 0) self.arr_mixed = np.array([1, -1]).repeat(N / 2) @@ -420,7 +420,7 @@ def time_add_overflow_both_arg_nan(self): hcal = pd.tseries.holiday.USFederalHolidayCalendar() -# These offsets currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ pd.offsets.Day(), pd.offsets.BYearEnd(), diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 103df0fd94847..b58200911749e 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,6 +2,8 @@ import pandas as pd +from .pandas_vb_common import tm + class BooleanArray: def setup(self): @@ -39,3 +41,32 @@ def time_constructor(self): def time_from_integer_array(self): pd.array(self.values_integer, dtype="Int64") + + +class ArrowStringArray: + + params = [False, True] + param_names = ["multiple_chunks"] + + def setup(self, multiple_chunks): + try: + import pyarrow as pa + except ImportError: + raise NotImplementedError + strings = tm.rands_array(3, 10_000) + if multiple_chunks: + chunks = [strings[i : i + 100] for i in range(0, 
len(strings), 100)] + self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks)) + else: + self.array = pd.arrays.ArrowStringArray(pa.array(strings)) + + def time_setitem(self, multiple_chunks): + for i in range(200): + self.array[i] = "foo" + + def time_setitem_list(self, multiple_chunks): + indexer = list(range(0, 50)) + list(range(-50, 0)) + self.array[indexer] = ["foo"] * len(indexer) + + def time_setitem_slice(self, multiple_chunks): + self.array[::10] = "foo" diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 268f25c3d12e3..ff0b3b2fb651d 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -19,7 +19,7 @@ class Constructor: def setup(self): - N = 10 ** 5 + N = 10**5 self.categories = list("abcde") self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) @@ -71,16 +71,16 @@ def time_existing_series(self): class AsType: def setup(self): - N = 10 ** 5 + N = 10**5 random_pick = np.random.default_rng().choice categories = { "str": list(string.ascii_letters), - "int": np.random.randint(2 ** 16, size=154), + "int": np.random.randint(2**16, size=154), "float": sys.maxsize * np.random.random((38,)), "timestamp": [ - pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578) + pd.Timestamp(x, unit="s") for x in np.random.randint(2**18, size=578) ], } @@ -112,7 +112,7 @@ def astype_datetime(self): class Concat: def setup(self): - N = 10 ** 5 + N = 10**5 self.s = pd.Series(list("aabbcd") * N).astype("category") self.a = pd.Categorical(list("aabbcd") * N) @@ -148,7 +148,7 @@ class ValueCounts: param_names = ["dropna"] def setup(self, dropna): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -166,7 +166,7 @@ def time_rendering(self): class SetCategories: def setup(self): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -176,7 +176,7 @@ def time_set_categories(self): class RemoveCategories: def setup(self): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -186,8 +186,8 @@ def time_remove_categories(self): class Rank: def setup(self): - N = 10 ** 5 - ncats = 100 + N = 10**5 + ncats = 15 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = pd.Series(self.s_str, dtype="category") @@ -241,7 +241,7 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains: def setup(self): - N = 10 ** 5 + N = 10**5 self.ci = tm.makeCategoricalIndex(N) self.c = self.ci.values self.key = self.ci.categories[0] @@ -259,7 +259,7 @@ class CategoricalSlicing: param_names = ["index"] def setup(self, index): - N = 10 ** 6 + N = 10**6 categories = ["a", "b", "c"] values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": @@ -295,7 +295,7 @@ def time_getitem_bool_array(self, index): class Indexing: def setup(self): - N = 10 ** 5 + N = 10**5 self.index = pd.CategoricalIndex(range(N), range(N)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] @@ -327,7 +327,7 @@ def time_sort_values(self): class SearchSorted: def setup(self): - N = 10 ** 5 + N = 10**5 self.ci = tm.makeCategoricalIndex(N).sort_values() self.c = self.ci.values self.key = self.ci.categories[1] diff --git a/asv_bench/benchmarks/ctors.py 
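The new ArrowStringArray benchmark above times `__setitem__` against both a single-chunk and a multi-chunk pyarrow backing array, since writes have to handle chunk boundaries in the latter case. A minimal sketch mirroring that setup (the `pd.arrays.ArrowStringArray` constructor is used exactly as the benchmark does; the data here is just a toy stand-in):

```python
# Mirrors the benchmark setup above: the same strings backed by a single
# pyarrow array vs. a chunked array, wrapped in pandas' ArrowStringArray.
import pyarrow as pa
import pandas as pd

strings = ["foo", "bar", "baz"] * 100

single_chunk = pd.arrays.ArrowStringArray(pa.array(strings))
multi_chunk = pd.arrays.ArrowStringArray(
    pa.chunked_array([pa.array(strings[i : i + 50]) for i in range(0, len(strings), 50)])
)

single_chunk[0] = "spam"    # scalar setitem, as in time_setitem
multi_chunk[::10] = "spam"  # slice setitem, as in time_setitem_slice
print(len(single_chunk), len(multi_chunk))
```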
b/asv_bench/benchmarks/ctors.py index 5993b068feadf..ef8b16f376d6a 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -76,7 +76,7 @@ def setup(self, data_fmt, with_index, dtype): raise NotImplementedError( "Series constructors do not support using generators with indexes" ) - N = 10 ** 4 + N = 10**4 if dtype == "float": arr = np.random.randn(N) else: @@ -90,7 +90,7 @@ def time_series_constructor(self, data_fmt, with_index, dtype): class SeriesDtypesConstructors: def setup(self): - N = 10 ** 4 + N = 10**4 self.arr = np.random.randn(N) self.arr_str = np.array(["foo", "bar", "baz"], dtype=object) self.s = Series( @@ -114,7 +114,7 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor: def setup(self): - N = 10 ** 4 + N = 10**4 self.iterables = [tm.makeStringIndex(N), range(20)] def time_multiindex_from_iterables(self): diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index c561b80ed1ca6..55f6be848aa13 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -50,15 +50,26 @@ def time_pandas_dtype_invalid(self, dtype): class SelectDtypes: - params = [ - tm.ALL_INT_DTYPES - + tm.ALL_EA_INT_DTYPES - + tm.FLOAT_DTYPES - + tm.COMPLEX_DTYPES - + tm.DATETIME64_DTYPES - + tm.TIMEDELTA64_DTYPES - + tm.BOOL_DTYPES - ] + try: + params = [ + tm.ALL_INT_NUMPY_DTYPES + + tm.ALL_INT_EA_DTYPES + + tm.FLOAT_NUMPY_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] + except AttributeError: + params = [ + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] param_names = ["dtype"] def setup(self, dtype): diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index cbab9fdc9c0ba..b5442531e748a 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -43,7 +43,7 @@ def teardown(self, engine, threads): class Query: def setup(self): - N = 10 ** 6 + N = 10**6 halfway = (N // 2) - 1 index = pd.date_range("20010101", periods=N, freq="T") s = pd.Series(index) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 7fbe249788a98..20c0c0ea2f6fe 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -2,7 +2,10 @@ import pandas as pd from pandas import ( + NA, + Categorical, DataFrame, + Float64Dtype, MultiIndex, Series, Timestamp, @@ -18,7 +21,10 @@ ) except ImportError: # For compatibility with older versions - from pandas.core.datetools import * # noqa + from pandas.core.datetools import ( + Hour, + Nano, + ) class FromDicts: @@ -31,6 +37,9 @@ def setup(self): self.dict_list = frame.to_dict(orient="records") self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} + # arrays which we won't consolidate + self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)} + def time_list_of_dict(self): DataFrame(self.dict_list) @@ -50,6 +59,10 @@ def time_nested_dict_int64(self): # nested dict, integer indexes, regression described in #621 DataFrame(self.data2) + def time_dict_of_categoricals(self): + # dict of arrays that we won't consolidate + DataFrame(self.dict_of_categoricals) + class FromSeries: def setup(self): @@ -66,7 +79,7 @@ class FromDictwithTimestamp: param_names = ["offset"] def setup(self, offset): - N = 10 ** 3 + N = 10**3 idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N) df = 
DataFrame(np.random.randn(N, 10), index=idx) self.d = df.to_dict() @@ -127,6 +140,27 @@ def time_frame_from_range(self): self.df = DataFrame(self.data) +class FromScalar: + def setup(self): + self.nrows = 100_000 + + def time_frame_from_scalar_ea_float64(self): + DataFrame( + 1.0, + index=range(self.nrows), + columns=list("abc"), + dtype=Float64Dtype(), + ) + + def time_frame_from_scalar_ea_float64_na(self): + DataFrame( + NA, + index=range(self.nrows), + columns=list("abc"), + dtype=Float64Dtype(), + ) + + class FromArrays: goal_time = 0.2 @@ -171,4 +205,21 @@ def time_frame_from_arrays_sparse(self): ) +class From3rdParty: + # GH#44616 + + def setup(self): + try: + import torch + except ImportError: + raise NotImplementedError + + row = 700000 + col = 64 + self.val_tensor = torch.randn(row, col) + + def time_from_torch(self): + DataFrame(self.val_tensor) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index c32eda4928da7..a28e20a636ce2 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -50,7 +50,7 @@ def time_frame_fancy_lookup_all(self): class Reindex: def setup(self): - N = 10 ** 3 + N = 10**3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.idx_cols = np.random.randint(0, N, N) @@ -76,7 +76,7 @@ def time_reindex_axis1_missing(self): self.df.reindex(columns=self.idx) def time_reindex_both_axes(self): - self.df.reindex(index=self.idx, columns=self.idx) + self.df.reindex(index=self.idx, columns=self.idx_cols) def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) @@ -84,7 +84,7 @@ def time_reindex_upcast(self): class Rename: def setup(self): - N = 10 ** 3 + N = 10**3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.dict_idx = {k: k for k in self.idx} @@ -232,6 +232,22 @@ def time_to_html_mixed(self): self.df2.to_html() +class ToDict: + params = [["dict", "list", "series", "split", "records", "index"]] + param_names = ["orient"] + + def setup(self, orient): + data = np.random.randint(0, 1000, size=(10000, 4)) + self.int_df = DataFrame(data) + self.datetimelike_df = self.int_df.astype("timedelta64[ns]") + + def time_to_dict_ints(self, orient): + self.int_df.to_dict(orient=orient) + + def time_to_dict_datetimelike(self, orient): + self.datetimelike_df.to_dict(orient=orient) + + class ToNumpy: def setup(self): N = 10000 @@ -272,6 +288,26 @@ def time_values_mixed_wide(self): self.df_mixed_wide.values +class ToRecords: + def setup(self): + N = 100_000 + data = np.random.randn(N, 2) + mi = MultiIndex.from_arrays( + [ + np.arange(N), + date_range("1970-01-01", periods=N, freq="ms"), + ] + ) + self.df = DataFrame(data) + self.df_mi = DataFrame(data, index=mi) + + def time_to_records(self): + self.df.to_records(index=True) + + def time_to_records_multiindex(self): + self.df_mi.to_records(index=True) + + class Repr: def setup(self): nrows = 10000 @@ -313,7 +349,7 @@ def time_frame_mask_floats(self): class Isnull: def setup(self): - N = 10 ** 3 + N = 10**3 self.df_no_null = DataFrame(np.random.randn(N, N)) sample = np.array([np.nan, 1.0]) @@ -481,7 +517,7 @@ def time_frame_dtypes(self): class Equals: def setup(self): - N = 10 ** 3 + N = 10**3 self.float_df = DataFrame(np.random.randn(N, N)) self.float_df_nan = self.float_df.copy() self.float_df_nan.iloc[-1, -1] = np.nan @@ -522,8 +558,12 @@ class Interpolate: def setup(self, downcast): N = 
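The ToDict benchmark above parametrizes over every supported `orient`; each one reshapes the same frame into a structurally different container, which is why they are timed separately. The shapes on a tiny frame, for reference:

```python
# The orients exercised by the ToDict benchmark above, on a tiny frame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

print(df.to_dict("dict"))     # {'a': {0: 1, 1: 2}, 'b': {0: 3, 1: 4}}
print(df.to_dict("list"))     # {'a': [1, 2], 'b': [3, 4]}
print(df.to_dict("split"))    # {'index': [0, 1], 'columns': ['a', 'b'], 'data': [[1, 3], [2, 4]]}
print(df.to_dict("records"))  # [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]
print(df.to_dict("index"))    # {0: {'a': 1, 'b': 3}, 1: {'a': 2, 'b': 4}}
```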
10000 # this is the worst case, where every column has NaNs. - self.df = DataFrame(np.random.randn(N, 100)) - self.df.values[::2] = np.nan + arr = np.random.randn(N, 100) + # NB: we need to set values in array, not in df.values, otherwise + # the benchmark will be misleading for ArrayManager + arr[::2] = np.nan + + self.df = DataFrame(arr) self.df2 = DataFrame( { @@ -591,6 +631,9 @@ def time_frame_duplicated(self): def time_frame_duplicated_wide(self): self.df2.duplicated() + def time_frame_duplicated_subset(self): + self.df.duplicated(subset=["a"]) + class XS: @@ -598,7 +641,7 @@ class XS: param_names = ["axis"] def setup(self, axis): - self.N = 10 ** 4 + self.N = 10**4 self.df = DataFrame(np.random.randn(self.N, self.N)) def time_frame_xs(self, axis): @@ -698,9 +741,9 @@ class Describe: def setup(self): self.df = DataFrame( { - "a": np.random.randint(0, 100, 10 ** 6), - "b": np.random.randint(0, 100, 10 ** 6), - "c": np.random.randint(0, 100, 10 ** 6), + "a": np.random.randint(0, 100, 10**6), + "b": np.random.randint(0, 100, 10**6), + "c": np.random.randint(0, 100, 10**6), } ) @@ -711,17 +754,6 @@ def time_dataframe_describe(self): self.df.describe() -class SelectDtypes: - params = [100, 1000] - param_names = ["n"] - - def setup(self, n): - self.df = DataFrame(np.random.randn(10, n)) - - def time_select_dtypes(self, n): - self.df.select_dtypes(include="int") - - class MemoryUsage: def setup(self): self.df = DataFrame(np.random.randn(100000, 2), columns=list("AB")) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index ac7cd87c846d5..31654a5c75617 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,3 +1,6 @@ +from functools import wraps +import threading + import numpy as np from pandas import ( @@ -30,21 +33,57 @@ from pandas._libs import algos except ImportError: from pandas import algos -try: - from pandas._testing import test_parallel # noqa: PDF014 - have_real_test_parallel = True -except ImportError: - have_real_test_parallel = False - def test_parallel(num_threads=1): - def wrapper(fname): - return fname +from .pandas_vb_common import BaseIO # isort:skip - return wrapper +def test_parallel(num_threads=2, kwargs_list=None): + """ + Decorator to run the same function multiple times in parallel. -from .pandas_vb_common import BaseIO # isort:skip + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + + Notes + ----- + This decorator does not pass the return value of the decorated function. 
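The Interpolate change above ("we need to set values in array, not in df.values") matters because `df.values` can be a freshly consolidated copy, and under the ArrayManager it always is, so NaNs written into it never reach the DataFrame and the benchmark would measure a frame with no missing data. A small demonstration of the difference, using a mixed-dtype frame so the copy behavior is guaranteed:

```python
# Why the benchmark seeds NaNs in the ndarray before building the frame.
import numpy as np
import pandas as pd

arr = np.random.randn(10, 3)
arr[::2] = np.nan                  # mutate the source data...
df = pd.DataFrame(arr)             # ...then wrap it
assert df.isna().any().any()       # the NaNs are really in the frame

df2 = pd.DataFrame({"a": np.random.randn(10), "b": np.arange(10)})
df2.values[::2] = np.nan           # writes into a temporary consolidated copy
assert not df2.isna().any().any()  # the frame itself is untouched
```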
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper class ParallelGroupbyMethods: @@ -53,10 +92,9 @@ class ParallelGroupbyMethods: param_names = ["threads", "method"] def setup(self, threads, method): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 - ngroups = 10 ** 3 + + N = 10**6 + ngroups = 10**3 df = DataFrame( {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} ) @@ -86,10 +124,9 @@ class ParallelGroups: param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError - size = 2 ** 22 - ngroups = 10 ** 3 + + size = 2**22 + ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) @test_parallel(num_threads=threads) @@ -108,9 +145,8 @@ class ParallelTake1D: param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 + + N = 10**6 df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @@ -131,10 +167,9 @@ class ParallelKth: repeat = 5 def setup(self): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 7 - k = 5 * 10 ** 5 + + N = 10**7 + k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @test_parallel(num_threads=2, kwargs_list=kwargs_list) @@ -149,9 +184,8 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 + + N = 10**6 self.dti = date_range("1900-01-01", periods=N, freq="T") self.period = self.dti.to_period("D") @@ -204,8 +238,7 @@ class ParallelRolling: param_names = ["method"] def setup(self, method): - if not have_real_test_parallel: - raise NotImplementedError + win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, "rolling"): @@ -248,8 +281,7 @@ class ParallelReadCSV(BaseIO): param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError + rows = 10000 cols = 50 data = { @@ -284,8 +316,6 @@ class ParallelFactorize: param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError strings = tm.makeStringIndex(100000) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1648985a56b91..2de1f25fceace 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -7,6 +7,7 @@ from pandas import ( Categorical, DataFrame, + Index, MultiIndex, Series, Timestamp, @@ -18,6 +19,7 @@ method_blocklist = { "object": { + "diff", "median", "prod", "sem", @@ -73,7 +75,7 @@ class Apply: params = [4, 5] def setup(self, factor): - N = 10 ** factor + N = 10**factor # two cases: # - small groups: small data (N**4) + many labels (2000) -> average group # size of 5 (-> larger overhead of slicing method) @@ -110,13 
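With the private `pandas._testing.test_parallel` import gone, the decorator defined above runs the wrapped function once per thread, joining them all before returning, and discards any return value as its docstring notes. A short usage sketch in the style of the benchmarks that follow (the workload itself is a toy stand-in):

```python
# Usage sketch for the test_parallel decorator defined above: the wrapped
# function runs once per thread, optionally with per-thread kwargs.
import numpy as np

@test_parallel(num_threads=2, kwargs_list=[{"seed": 0}, {"seed": 1}])
def run(arr, seed=0):
    rng = np.random.default_rng(seed)
    (arr + rng.standard_normal(arr.shape)).sum()   # result is discarded by design

run(np.arange(1_000_000, dtype="float64"))   # started on two threads, then joined
```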
+112,25 @@ def time_copy_overhead_single_col(self, factor): self.df.groupby("key").apply(self.df_copy_function) +class ApplyNonUniqueUnsortedIndex: + def setup(self): + # GH 46527 + # unsorted and non-unique index + idx = np.arange(100)[::-1] + idx = Index(np.repeat(idx, 200), name="key") + self.df = DataFrame(np.random.randn(len(idx), 10), index=idx) + + def time_groupby_apply_non_unique_unsorted_index(self): + self.df.groupby("key", group_keys=False).apply(lambda x: x) + + class Groups: param_names = ["key"] params = ["int64_small", "int64_large", "object_small", "object_large"] def setup_cache(self): - size = 10 ** 6 + size = 10**6 data = { "int64_small": Series(np.random.randint(0, 100, size=size)), "int64_large": Series(np.random.randint(0, 10000, size=size)), @@ -160,7 +174,7 @@ class Nth: params = ["float32", "float64", "datetime", "object"] def setup(self, dtype): - N = 10 ** 5 + N = 10**5 # with datetimes (GH7555) if dtype == "datetime": values = date_range("1/1/2011", periods=N, freq="s") @@ -268,7 +282,7 @@ def time_multi_int_nunique(self, df): class AggFunctions: def setup_cache(self): - N = 10 ** 5 + N = 10**5 fac1 = np.array(["A", "B", "C"], dtype="O") fac2 = np.array(["one", "two"], dtype="O") df = DataFrame( @@ -301,7 +315,7 @@ def time_different_python_functions_singlecol(self, df): class GroupStrings: def setup(self): - n = 2 * 10 ** 5 + n = 2 * 10**5 alpha = list(map("".join, product(ascii_letters, repeat=4))) data = np.random.choice(alpha, (n // 5, 4), replace=False) data = np.repeat(data, 5, axis=0) @@ -315,7 +329,7 @@ def time_multi_columns(self): class MultiColumn: def setup_cache(self): - N = 10 ** 5 + N = 10**5 key1 = np.tile(np.arange(100, dtype=object), 1000) key2 = key1.copy() np.random.shuffle(key1) @@ -345,7 +359,7 @@ def time_col_select_numpy_sum(self, df): class Size: def setup(self): - n = 10 ** 5 + n = 10**5 offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") dates = np.datetime64("now") + offsets self.df = DataFrame( @@ -369,6 +383,18 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class Shift: + def setup(self): + N = 18 + self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))}) + + def time_defaults(self): + self.df.groupby("g").shift() + + def time_fill_value(self): + self.df.groupby("g").shift(fill_value=99) + + class FillNA: def setup(self): N = 100 @@ -391,9 +417,9 @@ def time_srs_bfill(self): class GroupByMethods: - param_names = ["dtype", "method", "application"] + param_names = ["dtype", "method", "application", "ncols"] params = [ - ["int", "float", "object", "datetime", "uint"], + ["int", "int16", "float", "object", "datetime", "uint"], [ "all", "any", @@ -405,6 +431,7 @@ class GroupByMethods: "cumprod", "cumsum", "describe", + "diff", "ffill", "first", "head", @@ -431,18 +458,42 @@ class GroupByMethods: "var", ], ["direct", "transformation"], + [1, 5], ] - def setup(self, dtype, method, application): + def setup(self, dtype, method, application, ncols): if method in method_blocklist.get(dtype, {}): raise NotImplementedError # skip benchmark - ngroups = 1000 + + if ncols != 1 and method in ["value_counts", "unique"]: + # DataFrameGroupBy doesn't have these methods + raise NotImplementedError + + if application == "transformation" and method in [ + "describe", + "head", + "tail", + "unique", + "value_counts", + "size", + ]: + # DataFrameGroupBy doesn't have these methods + raise NotImplementedError + + if method == "describe": + ngroups = 20 + elif method in ["mad", "skew"]: + ngroups = 100 + else: + 
ngroups = 1000 size = ngroups * 2 - rng = np.arange(ngroups) - values = rng.take(np.random.randint(0, ngroups, size=size)) + rng = np.arange(ngroups).reshape(-1, 1) + rng = np.broadcast_to(rng, (len(rng), ncols)) + taker = np.random.randint(0, ngroups, size=size) + values = rng.take(taker, axis=0) if dtype == "int": key = np.random.randint(0, size, size=size) - elif dtype == "uint": + elif dtype in ("int16", "uint"): key = np.random.randint(0, size, size=size, dtype=dtype) elif dtype == "float": key = np.concatenate( @@ -453,28 +504,30 @@ def setup(self, dtype, method, application): elif dtype == "datetime": key = date_range("1/1/2011", periods=size, freq="s") - df = DataFrame({"values": values, "key": key}) + cols = [f"values{n}" for n in range(ncols)] + df = DataFrame(values, columns=cols) + df["key"] = key - if application == "transform": - if method == "describe": - raise NotImplementedError + if len(cols) == 1: + cols = cols[0] - self.as_group_method = lambda: df.groupby("key")["values"].transform(method) - self.as_field_method = lambda: df.groupby("values")["key"].transform(method) + if application == "transformation": + self.as_group_method = lambda: df.groupby("key")[cols].transform(method) + self.as_field_method = lambda: df.groupby(cols)["key"].transform(method) else: - self.as_group_method = getattr(df.groupby("key")["values"], method) - self.as_field_method = getattr(df.groupby("values")["key"], method) + self.as_group_method = getattr(df.groupby("key")[cols], method) + self.as_field_method = getattr(df.groupby(cols)["key"], method) - def time_dtype_as_group(self, dtype, method, application): + def time_dtype_as_group(self, dtype, method, application, ncols): self.as_group_method() - def time_dtype_as_field(self, dtype, method, application): + def time_dtype_as_field(self, dtype, method, application, ncols): self.as_field_method() class GroupByCythonAgg: """ - Benchmarks specifically targetting our cython aggregation algorithms + Benchmarks specifically targeting our cython aggregation algorithms (using a big enough dataframe with simple key, so a large part of the time is actually spent in the grouped aggregation). 
""" @@ -544,7 +597,7 @@ class RankWithTies: ] def setup(self, dtype, tie_method): - N = 10 ** 4 + N = 10**4 if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: @@ -568,9 +621,41 @@ def time_sum(self): self.df.groupby(["a"])["b"].sum() +class String: + # GH#41596 + param_names = ["dtype", "method"] + params = [ + ["str", "string[python]"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + cols = list("abcdefghjkl") + self.df = DataFrame( + np.random.randint(0, 100, size=(1_000_000, len(cols))), + columns=cols, + dtype=dtype, + ) + + def time_str_func(self, dtype, method): + self.df.groupby("a")[self.df.columns[1:]].agg(method) + + class Categories: def setup(self): - N = 10 ** 5 + N = 10**5 arr = np.random.random(N) data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr} self.df = DataFrame(data) @@ -612,14 +697,14 @@ class Datelike: param_names = ["grouper"] def setup(self, grouper): - N = 10 ** 4 + N = 10**4 rng_map = { "period_range": period_range, "date_range": date_range, "date_range_tz": partial(date_range, tz="US/Central"), } self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N) - self.df = DataFrame(np.random.randn(10 ** 4, 2)) + self.df = DataFrame(np.random.randn(10**4, 2)) def time_sum(self, grouper): self.df.groupby(self.grouper).sum() @@ -665,6 +750,18 @@ def setup(self): data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"]) self.df = data + n = 1000 + self.df_wide = DataFrame( + np.random.randn(n, n), + index=np.random.choice(range(10), n), + ) + + n = 1_000_000 + self.df_tall = DataFrame( + np.random.randn(n, 3), + index=np.random.randint(0, 5, n), + ) + n = 20000 self.df1 = DataFrame( np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"] @@ -684,6 +781,12 @@ def time_transform_lambda_max(self): def time_transform_ufunc_max(self): self.df.groupby(level="lev1").transform(np.max) + def time_transform_lambda_max_tall(self): + self.df_tall.groupby(level=0).transform(lambda x: np.max(x, axis=0)) + + def time_transform_lambda_max_wide(self): + self.df_wide.groupby(level=0).transform(lambda x: np.max(x, axis=0)) + def time_transform_multi_key1(self): self.df1.groupby(["jim", "joe"])["jolie"].transform("max") @@ -728,7 +831,7 @@ class TransformEngine: params = [[True, False]] def setup(self, parallel): - N = 10 ** 3 + N = 10**3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], @@ -771,7 +874,7 @@ class AggEngine: params = [[True, False]] def setup(self, parallel): - N = 10 ** 3 + N = 10**3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], @@ -832,4 +935,18 @@ def function(values): self.grouper.agg(function, engine="cython") +class Sample: + def setup(self): + N = 10**3 + self.df = DataFrame({"a": np.zeros(N)}) + self.groups = np.arange(0, N) + self.weights = np.ones(N) + + def time_sample(self): + self.df.groupby(self.groups).sample(n=1) + + def time_sample_weights(self): + self.df.groupby(self.groups).sample(n=1, weights=self.weights) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index 6703cc791493a..da752b902b4fd 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -16,9 +16,9 @@ class Float64GroupIndex: # GH28303 def setup(self): 
self.df = pd.date_range( - start="1/1/2018", end="1/2/2018", periods=10 ** 6 + start="1/1/2018", end="1/2/2018", periods=10**6 ).to_frame() - self.group_index = np.round(self.df.index.astype(int) / 10 ** 9) + self.group_index = np.round(self.df.index.astype(int) / 10**9) def time_groupby(self): self.df.groupby(self.group_index).last() @@ -29,8 +29,8 @@ class UniqueAndFactorizeArange: param_names = ["exponent"] def setup(self, exponent): - a = np.arange(10 ** 4, dtype="float64") - self.a2 = (a + 10 ** exponent).repeat(100) + a = np.arange(10**4, dtype="float64") + self.a2 = (a + 10**exponent).repeat(100) def time_factorize(self, exponent): pd.factorize(self.a2) @@ -39,11 +39,26 @@ def time_unique(self, exponent): pd.unique(self.a2) +class Unique: + params = ["Int64", "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype) + self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype) + + def time_unique_with_duplicates(self, dtype): + pd.unique(self.ser) + + def time_unique(self, dtype): + pd.unique(self.ser_unique) + + class NumericSeriesIndexing: params = [ (pd.Int64Index, pd.UInt64Index, pd.Float64Index), - (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), + (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), ] param_names = ["index_dtype", "N"] @@ -61,7 +76,7 @@ class NumericSeriesIndexingShuffled: params = [ (pd.Int64Index, pd.UInt64Index, pd.Float64Index), - (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), + (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), ] param_names = ["index_dtype", "N"] diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py index 16fbc741775e4..1a88bb7eef37a 100644 --- a/asv_bench/benchmarks/index_cached_properties.py +++ b/asv_bench/benchmarks/index_cached_properties.py @@ -22,7 +22,7 @@ class IndexCache: param_names = ["index_type"] def setup(self, index_type): - N = 10 ** 5 + N = 10**5 if index_type == "MultiIndex": self.idx = pd.MultiIndex.from_product( [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]] @@ -56,9 +56,6 @@ def time_values(self, index_type): def time_shape(self, index_type): self.idx.shape - def time_is_monotonic(self, index_type): - self.idx.is_monotonic - def time_is_monotonic_decreasing(self, index_type): self.idx.is_monotonic_decreasing diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 9c05019c70396..dab33f02c2cd9 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -25,7 +25,7 @@ class SetOperations: param_names = ["dtype", "method"] def setup(self, dtype, method): - N = 10 ** 5 + N = 10**5 dates_left = date_range("1/1/2000", periods=N, freq="T") fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) @@ -46,7 +46,7 @@ def time_operation(self, dtype, method): class SetDisjoint: def setup(self): - N = 10 ** 5 + N = 10**5 B = N + 20000 self.datetime_left = DatetimeIndex(range(N)) self.datetime_right = DatetimeIndex(range(N, B)) @@ -57,8 +57,8 @@ def time_datetime_difference_disjoint(self): class Range: def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10 ** 6, step=3) - self.idx_dec = RangeIndex(start=10 ** 6, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10**6, step=3) + self.idx_dec = RangeIndex(start=10**6, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -86,6 +86,12 @@ def time_iter_dec(self): for _ in self.idx_dec: pass + 
def time_sort_values_asc(self): + self.idx_inc.sort_values() + + def time_sort_values_des(self): + self.idx_inc.sort_values(ascending=False) + class IndexEquals: def setup(self): @@ -133,7 +139,7 @@ class Indexing: param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 self.idx = getattr(tm, f"make{dtype}Index")(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) @@ -186,7 +192,7 @@ def time_get_loc(self): class IntervalIndexMethod: # GH 24813 - params = [10 ** 3, 10 ** 5] + params = [10**3, 10**5] def setup(self, N): left = np.append(np.arange(N), np.array(0)) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 10fb926ee4d03..54da7c109e02a 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -13,7 +13,6 @@ CategoricalIndex, DataFrame, Float64Index, - IndexSlice, Int64Index, IntervalIndex, MultiIndex, @@ -37,7 +36,7 @@ class NumericSeriesIndexing: param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10 ** 6 + N = 10**6 indices = { "unique_monotonic_inc": index(range(N)), "nonunique_monotonic_inc": index( @@ -97,7 +96,7 @@ class NonNumericSeriesIndexing: param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10 ** 6 + N = 10**6 if index == "string": index = tm.makeStringIndex(N) elif index == "datetime": @@ -144,6 +143,12 @@ def setup(self): def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] + def time_at(self): + self.df.at[self.idx_scalar, self.col_scalar] + + def time_at_setitem(self): + self.df.at[self.idx_scalar, self.col_scalar] = 0.0 + def time_getitem_scalar(self): self.df[self.col_scalar][self.idx_scalar] @@ -158,25 +163,39 @@ def time_boolean_rows_boolean(self): class DataFrameNumericIndexing: - def setup(self): + + params = [ + (Int64Index, UInt64Index, Float64Index), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), + ] + param_names = ["index_dtype", "index_structure"] + + def setup(self, index, index_structure): + N = 10**5 + indices = { + "unique_monotonic_inc": index(range(N)), + "nonunique_monotonic_inc": index( + list(range(55)) + [54] + list(range(55, N - 1)) + ), + } self.idx_dupe = np.array(range(30)) * 99 - self.df = DataFrame(np.random.randn(100000, 5)) + self.df = DataFrame(np.random.randn(N, 5), index=indices[index_structure]) self.df_dup = concat([self.df, 2 * self.df, 3 * self.df]) - self.bool_indexer = [True] * 50000 + [False] * 50000 + self.bool_indexer = [True] * (N // 2) + [False] * (N - N // 2) - def time_iloc_dups(self): + def time_iloc_dups(self, index, index_structure): self.df_dup.iloc[self.idx_dupe] - def time_loc_dups(self): + def time_loc_dups(self, index, index_structure): self.df_dup.loc[self.idx_dupe] - def time_iloc(self): + def time_iloc(self, index, index_structure): self.df.iloc[:100, 0] - def time_loc(self): + def time_loc(self, index, index_structure): self.df.loc[:100, 0] - def time_bool_indexer(self): + def time_bool_indexer(self, index, index_structure): self.df[self.bool_indexer] @@ -200,28 +219,81 @@ def time_take(self, index): class MultiIndexing: - def setup(self): - mi = MultiIndex.from_product([range(1000), range(1000)]) - self.s = Series(np.random.randn(1000000), index=mi) - self.df = DataFrame(self.s) - n = 100000 - with warnings.catch_warnings(record=True): - self.mdt = DataFrame( - { - "A": np.random.choice(range(10000, 45000, 1000), n), - "B": np.random.choice(range(10, 400), n), - "C": 
np.random.choice(range(1, 150), n), - "D": np.random.choice(range(10000, 45000), n), - "x": np.random.choice(range(400), n), - "y": np.random.choice(range(25), n), - } - ) - self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] - self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() + params = [True, False] + param_names = ["unique_levels"] + + def setup(self, unique_levels): + self.nlevels = 2 + if unique_levels: + mi = MultiIndex.from_arrays([range(1000000)] * self.nlevels) + else: + mi = MultiIndex.from_product([range(1000)] * self.nlevels) + self.df = DataFrame(np.random.randn(len(mi)), index=mi) + + self.tgt_slice = slice(200, 800) + self.tgt_null_slice = slice(None) + self.tgt_list = list(range(0, 1000, 10)) + self.tgt_scalar = 500 + + bool_indexer = np.zeros(len(mi), dtype=np.bool_) + bool_indexer[slice(0, len(mi), 100)] = True + self.tgt_bool_indexer = bool_indexer - def time_index_slice(self): - self.mdt.loc[self.idx, :] + def time_loc_partial_key_slice(self, unique_levels): + self.df.loc[self.tgt_slice, :] + + def time_loc_partial_key_null_slice(self, unique_levels): + self.df.loc[self.tgt_null_slice, :] + + def time_loc_partial_key_list(self, unique_levels): + self.df.loc[self.tgt_list, :] + + def time_loc_partial_key_scalar(self, unique_levels): + self.df.loc[self.tgt_scalar, :] + + def time_loc_partial_key_bool_indexer(self, unique_levels): + self.df.loc[self.tgt_bool_indexer, :] + + def time_loc_all_slices(self, unique_levels): + target = tuple([self.tgt_slice] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_null_slices(self, unique_levels): + target = tuple([self.tgt_null_slice] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_lists(self, unique_levels): + target = tuple([self.tgt_list] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_scalars(self, unique_levels): + target = tuple([self.tgt_scalar] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_bool_indexers(self, unique_levels): + target = tuple([self.tgt_bool_indexer] * self.nlevels) + self.df.loc[target, :] + + def time_loc_slice_plus_null_slice(self, unique_levels): + target = (self.tgt_slice, self.tgt_null_slice) + self.df.loc[target, :] + + def time_loc_null_slice_plus_slice(self, unique_levels): + target = (self.tgt_null_slice, self.tgt_slice) + self.df.loc[target, :] + + def time_xs_level_0(self, unique_levels): + target = self.tgt_scalar + self.df.xs(target, level=0) + + def time_xs_level_1(self, unique_levels): + target = self.tgt_scalar + self.df.xs(target, level=1) + + def time_xs_full_key(self, unique_levels): + target = tuple([self.tgt_scalar] * self.nlevels) + self.df.xs(target) class IntervalIndexing: @@ -257,13 +329,31 @@ def time_get_indexer_mismatched_tz(self): self.dti.get_indexer(self.dti2) +class SortedAndUnsortedDatetimeIndexLoc: + def setup(self): + dti = date_range("2016-01-01", periods=10000, tz="US/Pacific") + index = np.array(dti) + + unsorted_index = index.copy() + unsorted_index[10] = unsorted_index[20] + + self.df_unsorted = DataFrame(index=unsorted_index, data={"a": 1}) + self.df_sort = DataFrame(index=index, data={"a": 1}) + + def time_loc_unsorted(self): + self.df_unsorted.loc["2016-6-11"] + + def time_loc_sorted(self): + self.df_sort.loc["2016-6-11"] + + class CategoricalIndexIndexing: params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] param_names = ["index"] def setup(self, index): - N = 10 ** 5 + N = 10**5 values = list("a" * N + "b" * N + "c" * N) indices = { "monotonic_incr": CategoricalIndex(values), 
@@ -332,7 +422,7 @@ class IndexSingleRow: param_names = ["unique_cols"] def setup(self, unique_cols): - arr = np.arange(10 ** 7).reshape(-1, 10) + arr = np.arange(10**7).reshape(-1, 10) df = DataFrame(arr) dtypes = ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8", "f8", "f4"] for i, d in enumerate(dtypes): @@ -364,13 +454,22 @@ def time_frame_assign_timeseries_index(self): class InsertColumns: def setup(self): - self.N = 10 ** 3 + self.N = 10**3 self.df = DataFrame(index=range(self.N)) + self.df2 = DataFrame(np.random.randn(self.N, 2)) def time_insert(self): for i in range(100): self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) + def time_insert_middle(self): + # same as time_insert but inserting to a middle column rather than + # front or back (which have fast-paths) + for i in range(100): + self.df2.insert( + 1, "colname", np.random.randn(self.N), allow_duplicates=True + ) + def time_assign_with_setitem(self): for i in range(100): self.df[i] = np.random.randn(self.N) @@ -390,12 +489,14 @@ class ChainIndexing: def setup(self, mode): self.N = 1000000 + self.df = DataFrame({"A": np.arange(self.N), "B": "foo"}) def time_chained_indexing(self, mode): + df = self.df + N = self.N with warnings.catch_warnings(record=True): with option_context("mode.chained_assignment", mode): - df = DataFrame({"A": np.arange(self.N), "B": "foo"}) - df2 = df[df.A > self.N // 2] + df2 = df[df.A > N // 2] df2["C"] = 1.0 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 30ef7f63dc0dc..0c6cb89f49da1 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,5 +1,5 @@ """ -Benchmarks in this fiel depend exclusively on code in _libs/ +Benchmarks in this file depend exclusively on code in _libs/ If a PR does not edit anything in _libs, it is very unlikely that benchmarks in this file will be affected. @@ -35,25 +35,49 @@ class NumericEngineIndexing: params = [ _get_numeric_engines(), ["monotonic_incr", "monotonic_decr", "non_monotonic"], + [True, False], + [10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF ] - param_names = ["engine_and_dtype", "index_type"] + param_names = ["engine_and_dtype", "index_type", "unique", "N"] - def setup(self, engine_and_dtype, index_type): + def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype - N = 10 ** 5 - values = list([1] * N + [2] * N + [3] * N) - arr = { - "monotonic_incr": np.array(values, dtype=dtype), - "monotonic_decr": np.array(list(reversed(values)), dtype=dtype), - "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype), - }[index_type] - self.data = engine(lambda: arr, len(arr)) + if index_type == "monotonic_incr": + if unique: + arr = np.arange(N * 3, dtype=dtype) + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype) + elif index_type == "monotonic_decr": + if unique: + arr = np.arange(N * 3, dtype=dtype)[::-1] + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype)[::-1] + else: + assert index_type == "non_monotonic" + if unique: + arr = np.empty(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) + else: + arr = np.array([1, 2, 3] * N, dtype=dtype) + + self.data = engine(arr) # code below avoids populating the mapping etc. while timing. 
self.data.get_loc(2) - def time_get_loc(self, engine_and_dtype, index_type): - self.data.get_loc(2) + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] + + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) class ObjectEngineIndexing: @@ -62,7 +86,7 @@ class ObjectEngineIndexing: param_names = ["index_type"] def setup(self, index_type): - N = 10 ** 5 + N = 10**5 values = list("a" * N + "b" * N + "c" * N) arr = { "monotonic_incr": np.array(values, dtype=object), @@ -70,7 +94,7 @@ def setup(self, index_type): "non_monotonic": np.array(list("abc") * N, dtype=object), }[index_type] - self.data = libindex.ObjectEngine(lambda: arr, len(arr)) + self.data = libindex.ObjectEngine(arr) # code below avoids populating the mapping etc. while timing. self.data.get_loc("b") diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 0aa924dabd469..0bbb599f2b045 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -85,8 +85,8 @@ class MaybeConvertNumeric: # go in benchmarks/libs.py def setup_cache(self): - N = 10 ** 6 - arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64") + N = 10**6 + arr = np.repeat([2**63], N) + np.arange(N).astype("uint64") data = arr.astype(object) data[1::2] = arr[1::2].astype(str) data[-1] = -1 @@ -101,7 +101,7 @@ class MaybeConvertObjects: # does have some run-time imports from outside of _libs def setup(self): - N = 10 ** 5 + N = 10**5 data = list(range(N)) data[0] = NaT @@ -115,19 +115,27 @@ def time_maybe_convert_objects(self): class ToDatetimeFromIntsFloats: def setup(self): self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") + self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64") self.ts_sec_float = self.ts_sec.astype("float64") self.ts_nanosec = 1_000_000 * self.ts_sec + self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint self.ts_nanosec_float = self.ts_nanosec.astype("float64") - # speed of int64 and float64 paths should be comparable + # speed of int64, uint64 and float64 paths should be comparable def time_nanosec_int64(self): to_datetime(self.ts_nanosec, unit="ns") + def time_nanosec_uint64(self): + to_datetime(self.ts_nanosec_uint, unit="ns") + def time_nanosec_float64(self): to_datetime(self.ts_nanosec_float, unit="ns") + def time_sec_uint64(self): + to_datetime(self.ts_sec_uint, unit="s") + def time_sec_int64(self): to_datetime(self.ts_sec, unit="s") @@ -165,6 +173,7 @@ def setup(self): self.strings_tz_space = [ x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng ] + self.strings_zero_tz = [x.strftime("%Y-%m-%d %H:%M:%S") + "Z" for x in rng] def time_iso8601(self): to_datetime(self.strings) @@ -181,6 +190,10 @@ def time_iso8601_format_no_sep(self): def time_iso8601_tz_spaceformat(self): to_datetime(self.strings_tz_space) + def time_iso8601_infer_zero_tz_format(self): + # GH 41047 + to_datetime(self.strings_zero_tz, infer_datetime_format=True) + class ToDatetimeNONISO8601: def setup(self): @@ -264,6 +277,16 @@ def time_dup_string_tzoffset_dates(self, cache): to_datetime(self.dup_string_with_tz, cache=cache) +# GH 43901 +class ToDatetimeInferDatetimeFormat: + def setup(self): + rng = date_range(start="1/1/2000", periods=100000, freq="H") + self.strings = rng.strftime("%Y-%m-%d 
%H:%M:%S").tolist() + + def time_infer_datetime_format(self): + to_datetime(self.strings, infer_datetime_format=True) + + class ToTimedelta: def setup(self): self.ints = np.random.randint(0, 60, size=10000) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 5ff9431fbf8e4..10aef954a3475 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ from pandas import ( Categorical, DataFrame, + concat, date_range, read_csv, to_datetime, @@ -54,6 +55,26 @@ def time_frame(self, kind): self.df.to_csv(self.fname) +class ToCSVMultiIndexUnusedLevels(BaseIO): + + fname = "__test__.csv" + + def setup(self): + df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1}) + self.df = df.set_index(["a", "b"]) + self.df_unused_levels = self.df.iloc[:10_000] + self.df_single_index = df.set_index(["a"]).iloc[:10_000] + + def time_full_frame(self): + self.df.to_csv(self.fname) + + def time_sliced_frame(self): + self.df_unused_levels.to_csv(self.fname) + + def time_single_index_frame(self): + self.df_single_index.to_csv(self.fname) + + class ToCSVDatetime(BaseIO): fname = "__test__.csv" @@ -66,6 +87,21 @@ def time_frame_date_formatting(self): self.data.to_csv(self.fname, date_format="%Y%m%d") +class ToCSVDatetimeIndex(BaseIO): + + fname = "__test__.csv" + + def setup(self): + rng = date_range("2000", periods=100_000, freq="S") + self.data = DataFrame({"a": 1}, index=rng) + + def time_frame_date_formatting_index(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_date_no_format_index(self): + self.data.to_csv(self.fname) + + class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" @@ -206,7 +242,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = ([None, 10000], ["c", "python"]) + params = ([None, 10000], ["c", "python", "pyarrow"]) param_names = ["skiprows", "engine"] def setup(self, skiprows, engine): @@ -230,8 +266,8 @@ def time_skipprows(self, skiprows, engine): class ReadUint64Integers(StringIORewind): def setup(self): - self.na_values = [2 ** 63 + 500] - arr = np.arange(10000).astype("uint64") + 2 ** 63 + self.na_values = [2**63 + 500] + arr = np.arange(10000).astype("uint64") + 2**63 self.data1 = StringIO("\n".join(arr.astype(str).tolist())) arr = arr.astype(object) arr[500] = -1 @@ -291,7 +327,8 @@ class ReadCSVFloatPrecision(StringIORewind): def setup(self, sep, decimal, float_precision): floats = [ - "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) + "".join([random.choice(string.digits) for _ in range(28)]) + for _ in range(15) ] rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 @@ -319,7 +356,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - params = ["c", "python"] + params = ["c", "python", "pyarrow"] param_names = ["engine"] def setup(self, engine): @@ -395,7 +432,7 @@ class ReadCSVCachedParseDates(StringIORewind): param_names = ["do_cache", "engine"] def setup(self, do_cache, engine): - data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 + data = ("\n".join([f"10/{year}" for year in range(2000, 2100)]) + "\n") * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache, engine): @@ -458,6 +495,34 @@ def time_read_special_date(self, value, engine): ) +class ReadCSVMemMapUTF8: + + fname = "__test__.csv" + number = 5 + + def setup(self): + lines = [] + line_length = 128 + start_char 
= " " + end_char = "\U00010080" + # This for loop creates a list of 128-char strings + # consisting of consecutive Unicode chars + for lnum in range(ord(start_char), ord(end_char), line_length): + line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" + try: + line.encode("utf-8") + except UnicodeEncodeError: + # Some 16-bit words are not valid Unicode chars and must be skipped + continue + lines.append(line) + df = DataFrame(lines) + df = concat([df for n in range(100)], ignore_index=True) + df.to_csv(self.fname, index=False, header=False, encoding="utf-8") + + def time_read_memmapped_utf8(self): + read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c") + + class ParseDateComparison(StringIORewind): params = ([False, True],) param_names = ["cache_dates"] @@ -495,4 +560,14 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") +class ReadCSVIndexCol(StringIORewind): + def setup(self): + count_elem = 100_000 + data = "a,b\n" + "1,2\n" * count_elem + self.StringIO_input = StringIO(data) + + def time_read_csv_index_col(self): + read_csv(self.StringIO_input, index_col="a") + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 3363b43f29b78..a88c4374b7030 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -47,6 +47,25 @@ def time_write_excel(self, engine): writer.save() +class WriteExcelStyled: + params = ["openpyxl", "xlsxwriter"] + param_names = ["engine"] + + def setup(self, engine): + self.df = _generate_dataframe() + + def time_write_excel_style(self, engine): + bio = BytesIO() + bio.seek(0) + writer = ExcelWriter(bio, engine=engine) + df_style = self.df.style + df_style.applymap(lambda x: "border: red 1px solid;") + df_style.applymap(lambda x: "color: blue") + df_style.applymap(lambda x: "border-color: green black", subset=["float1"]) + df_style.to_excel(writer, sheet_name="Sheet1") + writer.save() + + class ReadExcel: params = ["xlrd", "openpyxl", "odf"] @@ -86,4 +105,15 @@ def time_read_excel(self, engine): read_excel(fname, engine=engine) +class ReadExcelNRows(ReadExcel): + def time_read_excel(self, engine): + if engine == "xlrd": + fname = self.fname_excel_xls + elif engine == "odf": + fname = self.fname_odf + else: + fname = self.fname_excel + read_excel(fname, engine=engine, nrows=10) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index d9d27ce7e5d8c..bb09fe0ff634d 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -109,7 +109,7 @@ class ToJSON(BaseIO): param_names = ["orient", "frame"] def setup(self, orient, frame): - N = 10 ** 5 + N = 10**5 ncols = 5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") @@ -172,15 +172,19 @@ def time_to_json(self, orient, frame): def peakmem_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) - def time_to_json_wide(self, orient, frame): + +class ToJSONWide(ToJSON): + def setup(self, orient, frame): + super().setup(orient, frame) base_df = getattr(self, frame).copy() - df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + df_wide = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) + self.df_wide = df_wide + + def 
time_to_json_wide(self, orient, frame): + self.df_wide.to_json(self.fname, orient=orient) def peakmem_to_json_wide(self, orient, frame): - base_df = getattr(self, frame).copy() - df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + self.df_wide.to_json(self.fname, orient=orient) class ToJSONISO(BaseIO): @@ -189,7 +193,7 @@ class ToJSONISO(BaseIO): param_names = ["orient"] def setup(self, orient): - N = 10 ** 5 + N = 10**5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") @@ -212,7 +216,7 @@ class ToJSONLines(BaseIO): fname = "__test__.json" def setup(self): - N = 10 ** 5 + N = 10**5 ncols = 5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 3cfa28de78c90..fb8b7dafa0ade 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -39,6 +39,8 @@ def setup(self, connection): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -53,7 +55,16 @@ class WriteSQLDtypes: params = ( ["sqlalchemy", "sqlite"], - ["float", "float_with_nan", "string", "bool", "int", "datetime"], + [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ], ) param_names = ["connection", "dtype"] @@ -78,6 +89,8 @@ def setup(self, connection, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -105,6 +118,8 @@ def setup(self): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -122,7 +137,16 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + params = [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ] param_names = ["dtype"] def setup(self, dtype): @@ -141,6 +165,8 @@ def setup(self, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 82166a2a95c76..f0902c9c2c328 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -34,13 +34,29 @@ def peakmem_classes_render(self, cols, rows): self._style_classes() self.st._render_html(True, True) + def time_tooltips_render(self, cols, rows): + self._style_tooltips() + self.st._render_html(True, True) + + def peakmem_tooltips_render(self, cols, 
rows): + self._style_tooltips() + self.st._render_html(True, True) + def time_format_render(self, cols, rows): self._style_format() - self.st.render() + self.st._render_html(True, True) def peakmem_format_render(self, cols, rows): self._style_format() - self.st.render() + self.st._render_html(True, True) + + def time_apply_format_hide_render(self, cols, rows): + self._style_apply_format_hide() + self.st._render_html(True, True) + + def peakmem_apply_format_hide_render(self, cols, rows): + self._style_apply_format_hide() + self.st._render_html(True, True) def _style_apply(self): def _apply_func(s): @@ -63,3 +79,15 @@ def _style_format(self): self.st = self.df.style.format( "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"] ) + + def _style_apply_format_hide(self): + self.st = self.df.style.applymap(lambda v: "color: red;") + self.st.format("{:.3f}") + self.st.hide_index(self.st.index[1:]) + self.st.hide_columns(self.st.columns[1:]) + + def _style_tooltips(self): + ttips = DataFrame("abc", index=self.df.index[::2], columns=self.df.columns[::2]) + self.st = self.df.style.set_tooltips(ttips) + self.st.hide_index(self.st.index[12:]) + self.st.hide_columns(self.st.columns[12:]) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 27eaecff09d0f..e3c6bf9bd4e07 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -158,6 +158,19 @@ def time_left_outer_join_index(self): self.left.join(self.right, on="jim") +class JoinEmpty: + def setup(self): + N = 100_000 + self.df = DataFrame({"A": np.arange(N)}) + self.df_empty = DataFrame(columns=["B", "C"], dtype="int64") + + def time_inner_join_left_empty(self): + self.df_empty.join(self.df, how="inner") + + def time_inner_join_right_empty(self): + self.df.join(self.df_empty, how="inner") + + class JoinNonUnique: # outer join of non-unique # GH 6329 @@ -216,6 +229,12 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) + def time_merge_dataframe_empty_right(self, sort): + merge(self.left, self.right.iloc[:0], sort=sort) + + def time_merge_dataframe_empty_left(self, sort): + merge(self.left.iloc[:0], self.right, sort=sort) + def time_merge_dataframes_cross(self, sort): merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) @@ -226,7 +245,7 @@ class I8Merge: param_names = ["how"] def setup(self, how): - low, high, n = -1000, 1000, 10 ** 6 + low, high, n = -1000, 1000, 10**6 self.left = DataFrame( np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG") ) @@ -262,12 +281,24 @@ def setup(self): Z=self.right_object["Z"].astype("category") ) + self.left_cat_col = self.left_object.astype({"X": "category"}) + self.right_cat_col = self.right_object.astype({"X": "category"}) + + self.left_cat_idx = self.left_cat_col.set_index("X") + self.right_cat_idx = self.right_cat_col.set_index("X") + def time_merge_object(self): merge(self.left_object, self.right_object, on="X") def time_merge_cat(self): merge(self.left_cat, self.right_cat, on="X") + def time_merge_on_cat_col(self): + merge(self.left_cat_col, self.right_cat_col, on="X") + + def time_merge_on_cat_idx(self): + merge(self.left_cat_idx, self.right_cat_idx, on="X") + class MergeOrdered: def setup(self): @@ -382,8 +413,8 @@ def time_multiby(self, direction, tolerance): class Align: def setup(self): - size = 5 * 10 ** 5 - rng = np.arange(0, 10 ** 13, 10 ** 7) + size = 5 * 10**5 + rng = np.arange(0, 
10**13, 10**7) stamps = np.datetime64("now").view("i8") + rng idx1 = np.sort(np.random.choice(stamps, size, replace=False)) idx2 = np.sort(np.random.choice(stamps, size, replace=False)) diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index 4e3f938a33eb1..f041499c9c622 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -2,7 +2,7 @@ Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs, which has its own directory. -If a PR does not edit anything in _libs/, then it is unlikely that thes +If a PR does not edit anything in _libs/, then it is unlikely that the benchmarks will be affected. """ import numpy as np diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 25df5b0214959..a498c6b2e4944 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -47,6 +47,29 @@ def time_small_get_loc_warm(self): self.mi_small.get_loc((99, "A", "A")) +class GetLocs: + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=["one", "two", "three"], + ) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"] + ) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] + ) + + def time_large_get_locs(self): + self.mi_large.get_locs([999, 19, "Z"]) + + def time_med_get_locs(self): + self.mi_med.get_locs([999, 9, "A"]) + + def time_small_get_locs(self): + self.mi_small.get_locs([99, "A", "A"]) + + class Duplicates: def setup(self): size = 65536 @@ -112,7 +135,7 @@ def time_get_indexer_and_pad(self): self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad") def time_is_monotonic(self): - self.mi_int.is_monotonic + self.mi_int.is_monotonic_increasing class Duplicated: @@ -203,7 +226,7 @@ class SetOperations: param_names = ["index_structure", "dtype", "method"] def setup(self, index_structure, dtype, method): - N = 10 ** 5 + N = 10**5 level1 = range(1000) level2 = date_range(start="1/1/2000", periods=N // 1000) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index ed44102700dc6..d3168bde0a783 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -17,7 +17,7 @@ try: import pandas._testing as tm except ImportError: - import pandas.util.testing as tm # noqa + import pandas.util.testing as tm # noqa:F401 numeric_dtypes = [ diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 249a8f3f556a1..789bb8d8533b1 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,9 +1,14 @@ -import importlib +import contextlib +import importlib.machinery +import importlib.util +import os +import pathlib import sys +import tempfile +from unittest import mock import matplotlib import numpy as np -import pkg_resources from pandas import ( DataFrame, @@ -111,22 +116,49 @@ class BackendLoading: warmup_time = 0 def setup(self): - dist = pkg_resources.get_distribution("pandas") - spec = importlib.machinery.ModuleSpec("my_backend", None) - mod = importlib.util.module_from_spec(spec) + mod = importlib.util.module_from_spec( + importlib.machinery.ModuleSpec("pandas_dummy_backend", None) + ) mod.plot = lambda *args, **kwargs: 1 - backends = pkg_resources.get_entry_map("pandas") - my_entrypoint = pkg_resources.EntryPoint( 
- "pandas_plotting_backend", mod.__name__, dist=dist - ) - backends["pandas_plotting_backends"][mod.__name__] = my_entrypoint - for i in range(10): - backends["pandas_plotting_backends"][str(i)] = my_entrypoint - sys.modules["my_backend"] = mod + with contextlib.ExitStack() as stack: + stack.enter_context( + mock.patch.dict(sys.modules, {"pandas_dummy_backend": mod}) + ) + tmp_path = pathlib.Path(stack.enter_context(tempfile.TemporaryDirectory())) + + sys.path.insert(0, os.fsdecode(tmp_path)) + stack.callback(sys.path.remove, os.fsdecode(tmp_path)) + + dist_info = tmp_path / "my_backend-0.0.0.dist-info" + dist_info.mkdir() + (dist_info / "entry_points.txt").write_bytes( + b"[pandas_plotting_backends]\n" + b"my_ep_backend = pandas_dummy_backend\n" + b"my_ep_backend0 = pandas_dummy_backend\n" + b"my_ep_backend1 = pandas_dummy_backend\n" + b"my_ep_backend2 = pandas_dummy_backend\n" + b"my_ep_backend3 = pandas_dummy_backend\n" + b"my_ep_backend4 = pandas_dummy_backend\n" + b"my_ep_backend5 = pandas_dummy_backend\n" + b"my_ep_backend6 = pandas_dummy_backend\n" + b"my_ep_backend7 = pandas_dummy_backend\n" + b"my_ep_backend8 = pandas_dummy_backend\n" + b"my_ep_backend9 = pandas_dummy_backend\n" + ) + self.stack = stack.pop_all() + + def teardown(self): + self.stack.close() def time_get_plot_backend(self): - _get_plot_backend("my_backend") + # finds the first my_ep_backend + _get_plot_backend("my_ep_backend") + + def time_get_plot_backend_fallback(self): + # iterates through all the my_ep_backend[0-9] before falling back + # to importlib.import_module + _get_plot_backend("pandas_dummy_backend") from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 5181b983c9f7a..29d2831be1522 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -28,6 +28,11 @@ def setup(self): index = MultiIndex.from_arrays([level1, level2]) self.s = Series(np.random.randn(N * K), index=index) self.s_subset = self.s[::2] + self.s_subset_no_cache = self.s[::2].copy() + + mi = MultiIndex.from_product([rng, range(100)]) + self.s2 = Series(np.random.randn(len(mi)), index=mi) + self.s2_subset = self.s2[::2].copy() def time_reindex_dates(self): self.df.reindex(self.rng_subset) @@ -35,9 +40,18 @@ def time_reindex_dates(self): def time_reindex_columns(self): self.df2.reindex(columns=self.df.columns[1:5]) - def time_reindex_multiindex(self): + def time_reindex_multiindex_with_cache(self): + # MultiIndex._values gets cached self.s.reindex(self.s_subset.index) + def time_reindex_multiindex_no_cache(self): + # Copy to avoid MultiIndex._values getting cached + self.s.reindex(self.s_subset_no_cache.index.copy()) + + def time_reindex_multiindex_no_cache_dates(self): + # Copy to avoid MultiIndex._values getting cached + self.s2_subset.reindex(self.s2.index.copy()) + class ReindexMethod: diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 2a115fb0b4fe3..8d4fc0240f2cc 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -9,7 +9,7 @@ class FillNa: param_names = ["inplace"] def setup(self, inplace): - N = 10 ** 6 + N = 10**6 rng = pd.date_range("1/1/2000", periods=N, freq="min") data = np.random.randn(N) data[::2] = np.nan @@ -28,10 +28,10 @@ class ReplaceDict: param_names = ["inplace"] def setup(self, inplace): - N = 10 ** 5 - start_value = 10 ** 5 + N = 10**5 + start_value = 10**5 self.to_rep = dict(enumerate(np.arange(N) + start_value)) - self.s = 
pd.Series(np.random.randint(N, size=10 ** 3)) + self.s = pd.Series(np.random.randint(N, size=10**3)) def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) @@ -44,13 +44,13 @@ class ReplaceList: param_names = ["inplace"] def setup(self, inplace): - self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10**7)) def time_replace_list(self, inplace): self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) def time_replace_list_one_match(self, inplace): - # the 1 can be held in self._df.blocks[0], while the inf and -inf cant + # the 1 can be held in self._df.blocks[0], while the inf and -inf can't self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) @@ -60,7 +60,7 @@ class Convert: param_names = ["constructor", "replace_data"] def setup(self, constructor, replace_data): - N = 10 ** 3 + N = 10**3 data = { "Series": pd.Series(np.random.randint(N, size=N)), "DataFrame": pd.DataFrame( diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 232aabfb87c58..05e12630d7540 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -78,7 +78,7 @@ def time_stack(self, dtype): self.df.stack() def time_unstack_fast(self, dtype): - # last level -> doesnt have to make copies + # last level -> doesn't have to make copies self.ser.unstack("bar") def time_unstack_slow(self, dtype): @@ -102,6 +102,7 @@ def setup(self, dtype): columns = np.arange(n) if dtype == "int": values = np.arange(m * m * n).reshape(m * m, n) + self.df = DataFrame(values, index, columns) else: # the category branch is ~20x slower than int. So we # cut down the size a bit. Now it's only ~3x slower. @@ -111,7 +112,10 @@ def setup(self, dtype): values = np.take(list(string.ascii_letters), indices) values = [pd.Categorical(v) for v in values.T] - self.df = DataFrame(values, index, columns) + self.df = DataFrame( + {i: cat for i, cat in enumerate(values)}, index, columns + ) + self.df2 = self.df.iloc[:-1] def time_full_product(self, dtype): @@ -255,7 +259,7 @@ class Cut: param_names = ["bins"] def setup(self, bins): - N = 10 ** 5 + N = 10**5 self.int_series = pd.Series(np.arange(N).repeat(5)) self.float_series = pd.Series(np.random.randn(N).repeat(5)) self.timedelta_series = pd.Series( diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index d35770b720f7a..d65a1a39e8bc7 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas as pd @@ -7,22 +9,24 @@ class Methods: params = ( ["DataFrame", "Series"], - [10, 1000], + [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], ) - param_names = ["constructor", "window", "dtype", "method"] + param_names = ["constructor", "window_kwargs", "dtype", "method"] - def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + def setup(self, constructor, window_kwargs, dtype, method): + N = 10**5 + window, kwargs = window_kwargs arr = (100 * np.random.random(N)).astype(dtype) - self.roll = getattr(pd, constructor)(arr).rolling(window) + obj = getattr(pd, constructor)(arr) + self.window = getattr(obj, window)(**kwargs) - def time_rolling(self, constructor, window, dtype, method): - getattr(self.roll, method)() + def 
time_method(self, constructor, window_kwargs, dtype, method): + getattr(self.window, method)() - def peakmem_rolling(self, constructor, window, dtype, method): - getattr(self.roll, method)() + def peakmem_method(self, constructor, window_kwargs, dtype, method): + getattr(self.window, method)() class Apply: @@ -36,7 +40,7 @@ class Apply: param_names = ["constructor", "window", "dtype", "function", "raw"] def setup(self, constructor, window, dtype, function, raw): - N = 10 ** 3 + N = 10**3 arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -44,77 +48,116 @@ def time_rolling(self, constructor, window, dtype, function, raw): self.roll.apply(function, raw=raw) -class Engine: +class NumbaEngineMethods: params = ( ["DataFrame", "Series"], ["int", "float"], - [np.sum, lambda x: np.sum(x) + 5], - ["cython", "numba"], - ["sum", "max", "min", "median", "mean"], + [("rolling", {"window": 10}), ("expanding", {})], + ["sum", "max", "min", "median", "mean", "var", "std"], + [True, False], + [None, 100], ) - param_names = ["constructor", "dtype", "function", "engine", "method"] - - def setup(self, constructor, dtype, function, engine, method): - N = 10 ** 3 - arr = (100 * np.random.random(N)).astype(dtype) - self.data = getattr(pd, constructor)(arr) - - def time_rolling_apply(self, constructor, dtype, function, engine, method): - self.data.rolling(10).apply(function, raw=True, engine=engine) - - def time_expanding_apply(self, constructor, dtype, function, engine, method): - self.data.expanding().apply(function, raw=True, engine=engine) - - def time_rolling_methods(self, constructor, dtype, function, engine, method): - getattr(self.data.rolling(10), method)(engine=engine) - - -class ExpandingMethods: - + param_names = [ + "constructor", + "dtype", + "window_kwargs", + "method", + "parallel", + "cols", + ] + + def setup(self, constructor, dtype, window_kwargs, method, parallel, cols): + N = 10**3 + window, kwargs = window_kwargs + shape = (N, cols) if cols is not None and constructor != "Series" else N + arr = (100 * np.random.random(shape)).astype(dtype) + data = getattr(pd, constructor)(arr) + + # Warm the cache + with warnings.catch_warnings(record=True): + # Catch parallel=True not being applicable e.g. 
1D data + self.window = getattr(data, window)(**kwargs) + getattr(self.window, method)( + engine="numba", engine_kwargs={"parallel": parallel} + ) + + def test_method(self, constructor, dtype, window_kwargs, method, parallel, cols): + with warnings.catch_warnings(record=True): + getattr(self.window, method)( + engine="numba", engine_kwargs={"parallel": parallel} + ) + + +class NumbaEngineApply: params = ( ["DataFrame", "Series"], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + [("rolling", {"window": 10}), ("expanding", {})], + [np.sum, lambda x: np.sum(x) + 5], + [True, False], + [None, 100], ) - param_names = ["constructor", "window", "dtype", "method"] - - def setup(self, constructor, dtype, method): - N = 10 ** 5 - N_groupby = 100 - arr = (100 * np.random.random(N)).astype(dtype) - self.expanding = getattr(pd, constructor)(arr).expanding() - self.expanding_groupby = ( - pd.DataFrame({"A": arr[:N_groupby], "B": range(N_groupby)}) - .groupby("B") - .expanding() - ) - - def time_expanding(self, constructor, dtype, method): - getattr(self.expanding, method)() - - def time_expanding_groupby(self, constructor, dtype, method): - getattr(self.expanding_groupby, method)() + param_names = [ + "constructor", + "dtype", + "window_kwargs", + "function", + "parallel", + "cols", + ] + + def setup(self, constructor, dtype, window_kwargs, function, parallel, cols): + N = 10**3 + window, kwargs = window_kwargs + shape = (N, cols) if cols is not None and constructor != "Series" else N + arr = (100 * np.random.random(shape)).astype(dtype) + data = getattr(pd, constructor)(arr) + + # Warm the cache + with warnings.catch_warnings(record=True): + # Catch parallel=True not being applicable e.g. 1D data + self.window = getattr(data, window)(**kwargs) + self.window.apply( + function, raw=True, engine="numba", engine_kwargs={"parallel": parallel} + ) + + def test_method(self, constructor, dtype, window_kwargs, function, parallel, cols): + with warnings.catch_warnings(record=True): + self.window.apply( + function, raw=True, engine="numba", engine_kwargs={"parallel": parallel} + ) class EWMMethods: - params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) - param_names = ["constructor", "window", "dtype", "method"] + params = ( + ["DataFrame", "Series"], + [ + ({"halflife": 10}, "mean"), + ({"halflife": 10}, "std"), + ({"halflife": 1000}, "mean"), + ({"halflife": 1000}, "std"), + ( + { + "halflife": "1 Day", + "times": pd.date_range("1900", periods=10**5, freq="23s"), + }, + "mean", + ), + ], + ["int", "float"], + ) + param_names = ["constructor", "kwargs_method", "dtype"] - def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + def setup(self, constructor, kwargs_method, dtype): + N = 10**5 + kwargs, method = kwargs_method arr = (100 * np.random.random(N)).astype(dtype) - times = pd.date_range("1900", periods=N, freq="23s") - self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) - self.ewm_times = getattr(pd, constructor)(arr).ewm( - halflife="1 Day", times=times - ) - - def time_ewm(self, constructor, window, dtype, method): - getattr(self.ewm, method)() + self.method = method + self.ewm = getattr(pd, constructor)(arr).ewm(**kwargs) - def time_ewm_times(self, constructor, window, dtype, method): - self.ewm_times.mean() + def time_ewm(self, constructor, kwargs_method, dtype): + getattr(self.ewm, self.method)() class VariableWindowMethods(Methods): @@ -122,43 +165,43 @@ class VariableWindowMethods(Methods): ["DataFrame", 
"Series"], ["50s", "1h", "1d"], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], ) param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + N = 10**5 arr = (100 * np.random.random(N)).astype(dtype) index = pd.date_range("2017-01-01", periods=N, freq="5s") - self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) + self.window = getattr(pd, constructor)(arr, index=index).rolling(window) class Pairwise: - params = ([10, 1000, None], ["corr", "cov"], [True, False]) - param_names = ["window", "method", "pairwise"] + params = ( + [({"window": 10}, "rolling"), ({"window": 1000}, "rolling"), ({}, "expanding")], + ["corr", "cov"], + [True, False], + ) + param_names = ["window_kwargs", "method", "pairwise"] - def setup(self, window, method, pairwise): - N = 10 ** 4 + def setup(self, kwargs_window, method, pairwise): + N = 10**4 n_groups = 20 + kwargs, window = kwargs_window groups = [i for _ in range(N // n_groups) for i in range(n_groups)] arr = np.random.random(N) self.df = pd.DataFrame(arr) - self.df_group = pd.DataFrame({"A": groups, "B": arr}).groupby("A") + self.window = getattr(self.df, window)(**kwargs) + self.window_group = getattr( + pd.DataFrame({"A": groups, "B": arr}).groupby("A"), window + )(**kwargs) - def time_pairwise(self, window, method, pairwise): - if window is None: - r = self.df.expanding() - else: - r = self.df.rolling(window=window) - getattr(r, method)(self.df, pairwise=pairwise) + def time_pairwise(self, kwargs_window, method, pairwise): + getattr(self.window, method)(self.df, pairwise=pairwise) - def time_groupby(self, window, method, pairwise): - if window is None: - r = self.df_group.expanding() - else: - r = self.df_group.rolling(window=window) - getattr(r, method)(self.df, pairwise=pairwise) + def time_groupby(self, kwargs_window, method, pairwise): + getattr(self.window_group, method)(self.df, pairwise=pairwise) class Quantile: @@ -172,7 +215,7 @@ class Quantile: param_names = ["constructor", "window", "dtype", "percentile"] def setup(self, constructor, window, dtype, percentile, interpolation): - N = 10 ** 5 + N = 10**5 arr = np.random.random(N).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -180,12 +223,39 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) +class Rank: + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + [True, False], + [True, False], + ["min", "max", "average"], + ) + param_names = [ + "constructor", + "window", + "dtype", + "percentile", + "ascending", + "method", + ] + + def setup(self, constructor, window, dtype, percentile, ascending, method): + N = 10**5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_rank(self, constructor, window, dtype, percentile, ascending, method): + self.roll.rank(pct=percentile, ascending=ascending, method=method) + + class PeakMemFixedWindowMinMax: params = ["min", "max"] def setup(self, operation): - N = 10 ** 6 + N = 10**6 arr = np.random.random(N) self.roll = pd.Series(arr).rolling(2) @@ -204,7 +274,7 @@ class ForwardWindowMethods: param_names = ["constructor", "window_size", "dtype", "method"] def setup(self, constructor, window_size, dtype, method): - N = 10 ** 5 + N = 10**5 arr = 
np.random.random(N).astype(dtype) indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=window_size) self.roll = getattr(pd, constructor)(arr).rolling(window=indexer) @@ -218,10 +288,18 @@ def peakmem_rolling(self, constructor, window_size, dtype, method): class Groupby: - params = ["sum", "median", "mean", "max", "min", "kurt", "sum"] + params = ( + ["sum", "median", "mean", "max", "min", "kurt", "sum"], + [ + ("rolling", {"window": 2}), + ("rolling", {"window": "30s", "on": "C"}), + ("expanding", {}), + ], + ) - def setup(self, method): + def setup(self, method, window_kwargs): N = 1000 + window, kwargs = window_kwargs df = pd.DataFrame( { "A": [str(i) for i in range(N)] * 10, @@ -229,14 +307,10 @@ def setup(self, method): "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) - self.groupby_roll_int = df.groupby("A").rolling(window=2) - self.groupby_roll_offset = df.groupby("A").rolling(window="30s", on="C") - - def time_rolling_int(self, method): - getattr(self.groupby_roll_int, method)() + self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) - def time_rolling_offset(self, method): - getattr(self.groupby_roll_offset, method)() + def time_method(self, method, window_kwargs): + getattr(self.groupby_window, method)() class GroupbyLargeGroups: @@ -296,5 +370,8 @@ def time_apply(self, method): table_method_func, raw=True, engine="numba" ) + def time_ewm_mean(self, method): + self.df.ewm(1, method=method).mean(engine="numba") + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 7592ce54e3712..09c318af76159 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,6 +3,7 @@ import numpy as np from pandas import ( + Index, NaT, Series, date_range, @@ -12,20 +13,36 @@ class SeriesConstructor: - - params = [None, "dict"] - param_names = ["data"] - - def setup(self, data): + def setup(self): self.idx = date_range( start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" ) - dict_data = dict(zip(self.idx, range(len(self.idx)))) - self.data = None if data is None else dict_data + self.data = dict(zip(self.idx, range(len(self.idx)))) + self.array = np.array([1, 2, 3]) + self.idx2 = Index(["a", "b", "c"]) - def time_constructor(self, data): + def time_constructor_dict(self): Series(data=self.data, index=self.idx) + def time_constructor_no_data(self): + Series(data=None, index=self.idx) + + def time_constructor_fastpath(self): + Series(self.array, index=self.idx2, name="name", fastpath=True) + + +class ToFrame: + params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] + param_names = ["dtype", "name"] + + def setup(self, dtype, name): + arr = np.arange(10**5) + ser = Series(arr, dtype=dtype) + self.ser = ser + + def time_to_frame(self, dtype, name): + self.ser.to_frame(name) + class NSort: @@ -48,7 +65,7 @@ class Dropna: param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 data = { "int": np.random.randint(1, 10, N), "datetime": date_range("2000-01-01", freq="S", periods=N), @@ -81,7 +98,7 @@ class SearchSorted: param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 5 + N = 10**5 data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) self.s = Series(data) @@ -117,7 +134,7 @@ def time_map(self, mapper, *args, **kwargs): class Clip: - params = [50, 1000, 10 ** 5] + params = [50, 1000, 10**5] param_names = ["n"] def setup(self, n): @@ -127,9 +144,19 @@ def 
time_clip(self, n): self.s.clip(0, 1) +class ClipDt: + def setup(self): + dr = date_range("20220101", periods=100_000, freq="s", tz="UTC") + self.clipper_dt = dr[0:1_000].repeat(100) + self.s = Series(dr) + + def time_clip(self): + self.s.clip(upper=self.clipper_dt) + + class ValueCounts: - params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] + params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] param_names = ["N", "dtype"] def setup(self, N, dtype): @@ -139,9 +166,21 @@ def time_value_counts(self, N, dtype): self.s.value_counts() +class ValueCountsObjectDropNAFalse: + + params = [10**3, 10**4, 10**5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_value_counts(self, N): + self.s.value_counts(dropna=False) + + class Mode: - params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] + params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] param_names = ["N", "dtype"] def setup(self, N, dtype): @@ -151,6 +190,18 @@ def time_mode(self, N, dtype): self.s.mode() +class ModeObjectDropNAFalse: + + params = [10**3, 10**4, 10**5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_mode(self, N): + self.s.mode(dropna=False) + + class Dir: def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) @@ -162,7 +213,7 @@ def time_dir_strings(self): class SeriesGetattr: # https://github.com/pandas-dev/pandas/issues/19764 def setup(self): - self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=10 ** 6)) + self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=10**6)) def time_series_datetimeindex_repr(self): getattr(self.s, "a", None) @@ -170,7 +221,7 @@ def time_series_datetimeindex_repr(self): class All: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] def setup(self, N, case, dtype): @@ -183,7 +234,7 @@ def time_all(self, N, case, dtype): class Any: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] def setup(self, N, case, dtype): @@ -211,7 +262,7 @@ class NanOps: "kurt", "prod", ], - [10 ** 3, 10 ** 6], + [10**3, 10**6], ["int8", "int32", "int64", "float64", "Int64", "boolean"], ] param_names = ["func", "N", "dtype"] diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 35e5818cd3b2b..10390cb4493cd 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -40,7 +40,7 @@ class SparseArrayConstructor: param_names = ["dense_proportion", "fill_value", "dtype"] def setup(self, dense_proportion, fill_value, dtype): - N = 10 ** 6 + N = 10**6 self.array = make_array(N, dense_proportion, fill_value, dtype) def time_sparse_array(self, dense_proportion, fill_value, dtype): @@ -67,16 +67,42 @@ def time_sparse_series_from_coo(self): class ToCoo: - def setup(self): + params = [True, False] + param_names = ["sort_labels"] + + def setup(self, sort_labels): s = Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 s[999] = 12.1 - s.index = MultiIndex.from_product([range(10)] * 4) - self.ss = s.astype("Sparse") - def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) + s_mult_lvl = 
s.set_axis(MultiIndex.from_product([range(10)] * 4)) + self.ss_mult_lvl = s_mult_lvl.astype("Sparse") + + s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2)) + self.ss_two_lvl = s_two_lvl.astype("Sparse") + + def time_sparse_series_to_coo(self, sort_labels): + self.ss_mult_lvl.sparse.to_coo( + row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels + ) + + def time_sparse_series_to_coo_single_level(self, sort_labels): + self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels) + + +class ToCooFrame: + def setup(self): + N = 10000 + k = 10 + arr = np.zeros((N, k), dtype=float) + arr[0, 0] = 3.0 + arr[12, 7] = -1.0 + arr[0, 9] = 11.2 + self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float", fill_value=0.0)) + + def time_to_coo(self): + self.df.sparse.to_coo() class Arithmetic: @@ -85,7 +111,7 @@ class Arithmetic: param_names = ["dense_proportion", "fill_value"] def setup(self, dense_proportion, fill_value): - N = 10 ** 6 + N = 10**6 arr1 = make_array(N, dense_proportion, fill_value, np.int64) self.array1 = SparseArray(arr1, fill_value=fill_value) arr2 = make_array(N, dense_proportion, fill_value, np.int64) @@ -110,7 +136,7 @@ class ArithmeticBlock: param_names = ["fill_value"] def setup(self, fill_value): - N = 10 ** 6 + N = 10**6 self.arr1 = self.make_block_array( length=N, num_blocks=1000, block_size=10, fill_value=fill_value ) @@ -120,10 +146,10 @@ def setup(self, fill_value): def make_block_array(self, length, num_blocks, block_size, fill_value): arr = np.full(length, fill_value) - indicies = np.random.choice( + indices = np.random.choice( np.arange(0, length, block_size), num_blocks, replace=False ) - for ind in indicies: + for ind in indices: arr[ind : ind + block_size] = np.random.randint(0, 100, block_size) return SparseArray(arr, fill_value=fill_value) @@ -140,4 +166,68 @@ def time_division(self, fill_value): self.arr1 / self.arr2 +class MinMax: + + params = (["min", "max"], [0.0, np.nan]) + param_names = ["func", "fill_value"] + + def setup(self, func, fill_value): + N = 1_000_000 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_min_max(self, func, fill_value): + getattr(self.sp_arr, func)() + + +class Take: + + params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False]) + param_names = ["indices", "allow_fill"] + + def setup(self, indices, allow_fill): + N = 1_000_000 + fill_value = 0.0 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_take(self, indices, allow_fill): + self.sp_arr.take(indices, allow_fill=allow_fill) + + +class GetItem: + def setup(self): + N = 1_000_000 + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_integer_indexing(self): + self.sp_arr[78] + + def time_slice(self): + self.sp_arr[1:] + + +class GetItemMask: + + params = [True, False, np.nan] + param_names = ["fill_value"] + + def setup(self, fill_value): + N = 1_000_000 + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool_) + fv_inds = np.unique( + np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32) + ) + b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value + self.sp_b_arr = SparseArray(b_arr, dtype=np.bool_, fill_value=fill_value) + + def time_mask(self, fill_value): + self.sp_arr[self.sp_b_arr] + + from .pandas_vb_common import setup # noqa: 
F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 5639d6702a92c..92a78b7c2f63d 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -83,7 +83,7 @@ class Rank: param_names = ["constructor", "pct"] def setup(self, constructor, pct): - values = np.random.randn(10 ** 5) + values = np.random.randn(10**5) self.data = getattr(pd, constructor)(values) def time_rank(self, constructor, pct): diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py new file mode 100644 index 0000000000000..ac1b7f65d2d90 --- /dev/null +++ b/asv_bench/benchmarks/strftime.py @@ -0,0 +1,64 @@ +import numpy as np + +import pandas as pd +from pandas import offsets + + +class DatetimeStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = pd.DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) + + def time_frame_date_to_str(self, obs): + self.data["d"].astype(str) + + def time_frame_date_formatting_default(self, obs): + self.data["d"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_date_formatting_custom(self, obs): + self.data["d"].dt.strftime(date_format="%Y---%m---%d") + + def time_frame_datetime_to_str(self, obs): + self.data["dt"].astype(str) + + def time_frame_datetime_formatting_default_date_only(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_datetime_formatting_default(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_datetime_formatting_default_with_float(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + + def time_frame_datetime_formatting_custom(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + +class BusinessHourStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + self.data = pd.DataFrame( + { + "off": [offsets.BusinessHour()] * obs, + } + ) + + def time_frame_offset_str(self, obs): + self.data["off"].apply(str) + + def time_frame_offset_repr(self, obs): + self.data["off"].apply(repr) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 32fbf4e6c7de3..eec722c9f167b 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,10 +3,12 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Series, ) +from pandas.arrays import StringArray from .pandas_vb_common import tm @@ -17,7 +19,7 @@ class Dtypes: def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) except ImportError: raise NotImplementedError @@ -28,7 +30,7 @@ class Construction: param_names = ["dtype"] def setup(self, dtype): - self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.series_arr = tm.rands_array(nchars=10, size=10**5) self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() # GH37371. 
Testing construction of string series/frames from ExtensionArrays @@ -180,7 +182,7 @@ class Repeat: param_names = ["repeats"] def setup(self, repeats): - N = 10 ** 5 + N = 10**5 self.s = Series(tm.makeStringIndex(N)) repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] @@ -195,7 +197,7 @@ class Cat: param_names = ["other_cols", "sep", "na_rep", "na_frac"] def setup(self, other_cols, sep, na_rep, na_frac): - N = 10 ** 5 + N = 10**5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) if other_cols == 0: @@ -266,7 +268,7 @@ def time_get_dummies(self, dtype): class Encode: def setup(self): - self.ser = Series(tm.makeUnicodeIndex()) + self.ser = Series(tm.makeStringIndex()) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") @@ -285,3 +287,18 @@ class Iter(Dtypes): def time_iter(self, dtype): for i in self.s: pass + + +class StringArrayConstruction: + def setup(self): + self.series_arr = tm.rands_array(nchars=10, size=10**5) + self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) + + def time_string_array_construction(self): + StringArray(self.series_arr) + + def time_string_array_with_nan_construction(self): + StringArray(self.series_arr_nan) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 5b123c7127c28..9373edadb8e90 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -131,7 +131,7 @@ class Iteration: param_names = ["time_index"] def setup(self, time_index): - N = 10 ** 6 + N = 10**6 if time_index is timedelta_range: self.idx = time_index(start=0, freq="T", periods=N) else: @@ -247,7 +247,7 @@ class SortIndex: param_names = ["monotonic"] def setup(self, monotonic): - N = 10 ** 5 + N = 10**5 idx = date_range(start="1/1/2000", periods=N, freq="s") self.s = Series(np.random.randn(N), index=idx) if not monotonic: diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 0607a799ec707..23ae73811204c 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -12,7 +12,7 @@ class TimeGetTimedeltaField: params = [ _sizes, - ["days", "h", "s", "seconds", "ms", "microseconds", "us", "ns", "nanoseconds"], + ["days", "seconds", "microseconds", "nanoseconds"], ] param_names = ["size", "field"] diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index f5f7adbf63995..b263ae21422b6 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -31,13 +31,15 @@ def setup(self, size, tz): dti = pd.date_range("2016-01-01", periods=10, tz=tz).repeat(size // 10) self.i8data = dti.asi8 - if size == 10 ** 6 and tz is tzlocal_obj: + if size == 10**6 and tz is tzlocal_obj: # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError def time_normalize_i8_timestamps(self, size, tz): - normalize_i8_timestamps(self.i8data, tz) + # 10 i.e. NPY_FR_ns + normalize_i8_timestamps(self.i8data, tz, 10) def time_is_date_array_normalized(self, size, tz): # TODO: cases with different levels of short-circuiting - is_date_array_normalized(self.i8data, tz) + # 10 i.e. 
NPY_FR_ns + is_date_array_normalized(self.i8data, tz, 10) diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index 0aea8332398b1..978a36e470cbb 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -14,7 +14,7 @@ pass hcal = pandas.tseries.holiday.USFederalHolidayCalendar() -# These offsets currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ offsets.Day(), offsets.BYearEnd(), diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 15a922da7ee76..af10102749627 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -1,6 +1,6 @@ """ -Period benchmarks that rely only on tslibs. See benchmarks.period for -Period benchmarks that rely on other parts fo pandas. +Period benchmarks that rely only on tslibs. See benchmarks.period for +Period benchmarks that rely on other parts of pandas. """ import numpy as np @@ -130,7 +130,7 @@ class TimeDT64ArrToPeriodArr: param_names = ["size", "freq", "tz"] def setup(self, size, freq, tz): - if size == 10 ** 6 and tz is tzlocal_obj: + if size == 10**6 and tz is tzlocal_obj: # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 4b52efc188bf4..44f288c7de216 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -40,7 +40,7 @@ class TimeResolution: param_names = ["unit", "size", "tz"] def setup(self, unit, size, tz): - if size == 10 ** 6 and tz is tzlocal_obj: + if size == 10**6 and tz is tzlocal_obj: # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index 6ed273281569b..2daf1861eb80a 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -1,6 +1,6 @@ """ -Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for -Timedelta benchmarks that rely on other parts fo pandas. +Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for +Timedelta benchmarks that rely on other parts of pandas. 
""" import datetime diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 180f95e7fbda5..f93ef1cef841f 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -41,7 +41,7 @@ gettz("Asia/Tokyo"), tzlocal_obj, ] -_sizes = [0, 1, 100, 10 ** 4, 10 ** 6] +_sizes = [0, 1, 100, 10**4, 10**6] class TimeIntsToPydatetime: @@ -57,7 +57,7 @@ def setup(self, box, size, tz): if box == "date" and tz is not None: # tz is ignored, so avoid running redundant benchmarks raise NotImplementedError # skip benchmark - if size == 10 ** 6 and tz is _tzs[-1]: + if size == 10**6 and tz is _tzs[-1]: # This is cumbersomely-slow, so skip to trim runtime raise NotImplementedError # skip benchmark diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index 793f43e9bbe35..c6b510efdca69 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -11,10 +11,14 @@ try: old_sig = False - from pandas._libs.tslibs.tzconversion import tz_convert_from_utc + from pandas._libs.tslibs import tz_convert_from_utc except ImportError: - old_sig = True - from pandas._libs.tslibs.tzconversion import tz_convert as tz_convert_from_utc + try: + old_sig = False + from pandas._libs.tslibs.tzconversion import tz_convert_from_utc + except ImportError: + old_sig = True + from pandas._libs.tslibs.tzconversion import tz_convert as tz_convert_from_utc class TimeTZConvert: @@ -25,7 +29,7 @@ class TimeTZConvert: param_names = ["size", "tz"] def setup(self, size, tz): - if size == 10 ** 6 and tz is tzlocal_obj: + if size == 10**6 and tz is tzlocal_obj: # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 956feaef5f83e..0000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,53 +0,0 @@ -# Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml -trigger: - branches: - include: - - master - - 1.2.x - paths: - exclude: - - 'doc/*' - -pr: -- master -- 1.2.x - -variables: - PYTEST_WORKERS: auto - -jobs: -# Mac and Linux use the same template -- template: ci/azure/posix.yml - parameters: - name: macOS - vmImage: macOS-10.14 - -- template: ci/azure/windows.yml - parameters: - name: Windows - vmImage: vs2017-win2016 - -- job: py37_32bit - pool: - vmImage: ubuntu-18.04 - - steps: - - script: | - docker pull quay.io/pypa/manylinux2014_i686 - docker run -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - /opt/python/cp37-cp37m/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . ~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel setuptools && \ - pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ - python setup.py build_ext -q -j2 && \ - python -m pip install --no-build-isolation -e . 
&& \ - pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" - displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - testResultsFiles: '**/test-*.xml' - failTaskOnFailedTests: true - testRunTitle: 'Publish test results for Python 3.7-32 bit full Linux' diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml deleted file mode 100644 index 2caacf3a07290..0000000000000 --- a/ci/azure/posix.yml +++ /dev/null @@ -1,43 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - ${{ if eq(parameters.name, 'macOS') }}: - py37_macos: - ENV_FILE: ci/deps/azure-macos-37.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - - steps: - - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - displayName: 'Set conda path' - - - script: ci/setup_env.sh - displayName: 'Setup environment and build pandas' - - - script: | - source activate pandas-dev - ci/run_tests.sh - displayName: 'Test' - - - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - displayName: 'Build versions' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' - - - script: | - source activate pandas-dev - python ci/print_skipped.py - displayName: 'Print skipped tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml deleted file mode 100644 index 5644ad46714d5..0000000000000 --- a/ci/azure/windows.yml +++ /dev/null @@ -1,57 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - py37_np17: - ENV_FILE: ci/deps/azure-windows-37.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - - py38_np18: - ENV_FILE: ci/deps/azure-windows-38.yaml - CONDA_PY: "38" - PATTERN: "not slow and not network and not high_memory" - - steps: - - powershell: | - Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" - Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" - displayName: 'Add conda to PATH' - - - script: conda update -q -n base conda - displayName: 'Update conda' - - - bash: | - conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml - displayName: 'Create anaconda environment' - - - bash: | - source activate pandas-dev - conda list - python setup.py build_ext -q -j 4 - python -m pip install --no-build-isolation -e . - displayName: 'Build' - - - bash: | - source activate pandas-dev - ci/run_tests.sh - displayName: 'Test' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' - - - bash: | - source activate pandas-dev - python ci/print_skipped.py - displayName: 'Print skipped tests' diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1844cb863c183..113186c746157 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -3,22 +3,18 @@ # Run checks related to code quality. # # This script is intended for both the CI and to check locally that code standards are -# respected. 
We are currently linting (PEP-8 and similar), looking for patterns of -# common mistakes (sphinx directives with missing blank lines, old style classes, -# unwanted imports...), we run doctests here (currently some files only), and we +# respected. We run doctests here (currently some files only), and we # validate formatting error in docstrings. # # Usage: # $ ./ci/code_checks.sh # run all checks -# $ ./ci/code_checks.sh lint # run linting only -# $ ./ci/code_checks.sh patterns # check for patterns that should not exist # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors -# $ ./ci/code_checks.sh typing # run static type analysis +# $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ - { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|typing]"; exit 9999; } +[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -38,49 +34,7 @@ function invgrep { } if [[ "$GITHUB_ACTIONS" == "true" ]]; then - FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s" INVGREP_PREPEND="##[error]" -else - FLAKE8_FORMAT="default" -fi - -### LINTING ### -if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - - # Check that cython casting is of the form `obj` as opposed to ` obj`; - # it doesn't make a difference, but we want to be internally consistent. - # Note: this grep pattern is (intended to be) equivalent to the python - # regex r'(?])> ' - MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG - invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # readability/casting: Warnings about C casting instead of C++ casting - # runtime/int: Warnings about using C number types instead of C++ ones - # build/include_subdir: Warnings about prefacing included header files with directory - -fi - -### PATTERNS ### -if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then - - # Check for the following code in the extension array base tests: `tm.assert_frame_equal` and `tm.assert_series_equal` - MSG='Check for invalid EA testing' ; echo $MSG - invgrep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for deprecated messages without sphinx directive' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG - invgrep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for unnecessary random seeds in asv benchmarks' ; echo $MSG - invgrep -R --exclude pandas_vb_common.py -E 'np.random.seed' asv_bench/benchmarks/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - fi ### CODE ### @@ -110,45 +64,13 @@ fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Doctests for individual files' ; echo $MSG - pytest -q --doctest-modules \ - pandas/core/accessor.py \ - pandas/core/aggregation.py \ - 
pandas/core/algorithms.py \ - pandas/core/base.py \ - pandas/core/construction.py \ - pandas/core/frame.py \ - pandas/core/generic.py \ - pandas/core/indexers.py \ - pandas/core/nanops.py \ - pandas/core/series.py \ - pandas/io/sql.py + MSG='Doctests' ; echo $MSG + # Ignore test_*.py files or else the unit tests will run + python -m pytest --doctest-modules --ignore-glob="**/test_*.py" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests for directories' ; echo $MSG - pytest -q --doctest-modules \ - pandas/_libs/ \ - pandas/api/ \ - pandas/arrays/ \ - pandas/compat/ \ - pandas/core/array_algos/ \ - pandas/core/arrays/ \ - pandas/core/computation/ \ - pandas/core/dtypes/ \ - pandas/core/groupby/ \ - pandas/core/indexes/ \ - pandas/core/ops/ \ - pandas/core/reshape/ \ - pandas/core/strings/ \ - pandas/core/tools/ \ - pandas/core/window/ \ - pandas/errors/ \ - pandas/io/clipboard/ \ - pandas/io/json/ \ - pandas/io/excel/ \ - pandas/io/parsers/ \ - pandas/io/sas/ \ - pandas/tseries/ + MSG='Cython Doctests' ; echo $MSG + python -m pytest --doctest-cython pandas/_libs RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -156,21 +78,17 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS01, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03 + MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 RET=$(($RET + $?)) ; echo $MSG "DONE" fi -### TYPING ### -if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then - - echo "mypy --version" - mypy --version - - MSG='Performing static analysis using mypy' ; echo $MSG - mypy pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" +### SINGLE-PAGE DOCS ### +if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then + python doc/make.py --warnings-are-errors --single pandas.Series.value_counts + python doc/make.py --warnings-are-errors --single pandas.Series.str.split + python doc/make.py clean fi exit $RET diff --git a/ci/condarc.yml b/ci/condarc.yml new file mode 100644 index 0000000000000..9d750b7102c39 --- /dev/null +++ b/ci/condarc.yml @@ -0,0 +1,32 @@ +# https://docs.conda.io/projects/conda/en/latest/configuration.html + +# always_yes (NoneType, bool) +# aliases: yes +# Automatically choose the 'yes' option whenever asked to proceed with a +# conda operation, such as when running `conda install`. +# +always_yes: true + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 30.0 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 10 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. 
+# +remote_backoff_factor: 3 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. +# +remote_read_timeout_secs: 60.0 diff --git a/ci/deps/actions-38-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml similarity index 64% rename from ci/deps/actions-38-numpydev.yaml rename to ci/deps/actions-310-numpydev.yaml index 6eed2daac0c3b..ef20c2aa889b9 100644 --- a/ci/deps/actions-38-numpydev.yaml +++ b/ci/deps/actions-310-numpydev.yaml @@ -2,20 +2,21 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.8.* + - python=3.10 # tools - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 + - pytest-xdist>=1.31 + - hypothesis>=5.5.3 + - pytest-asyncio>=0.17 # pandas dependencies + - python-dateutil - pytz - pip - pip: - - cython==0.29.21 # GH#34014 - - "git+git://github.com/dateutil/dateutil.git" + - "cython" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml new file mode 100644 index 0000000000000..deb23d435bddf --- /dev/null +++ b/ci/deps/actions-310.yaml @@ -0,0 +1,55 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.10 + + # test dependencies + - cython>=0.29.32 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numba + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + - pytables + - pyarrow<10 + - pyreadstat + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy<1.4.46 + - tabulate + - tzdata>=2022a + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml deleted file mode 100644 index cae4361ca37a7..0000000000000 --- a/ci/deps/actions-37-db-min.yaml +++ /dev/null @@ -1,48 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # required - - numpy<1.20 # GH#39541 compat for pyarrow<3 - - python-dateutil - - pytz - - # optional - - beautifulsoup4 - - blosc=1.17.0 - - python-blosc - - fastparquet=0.4.0 - - html5lib - - ipython - - jinja2 - - lxml=4.3.0 - - matplotlib - - nomkl - - numexpr - - openpyxl - - pandas-gbq - - google-cloud-bigquery>=1.27.2 # GH 36436 - - protobuf>=3.12.4 - - pyarrow=0.17.1 # GH 38803 - - pytables>=3.5.1 - - scipy - - xarray=0.12.3 - - xlrd<2.0 - - xlsxwriter - - xlwt - - moto - - flask - - # sql - - psycopg2=2.7 - - pymysql=0.8.1 - - sqlalchemy=1.3.0 diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml deleted file mode 100644 index e568f8615a8df..0000000000000 --- a/ci/deps/actions-37-db.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 - - # pandas dependencies - - beautifulsoup4 - - botocore>=1.11 - - dask - - fastparquet>=0.4.0 - - 
fsspec>=0.7.4 - - gcsfs>=0.6.0 - - geopandas - - html5lib - - matplotlib - - moto>=1.3.14 - - flask - - nomkl - - numexpr - - numpy=1.17.* - - odfpy - - openpyxl - - pandas-gbq - - google-cloud-bigquery>=1.27.2 # GH 36436 - - psycopg2 - - pyarrow>=0.17.0 - - pymysql - - pytables - - python-snappy - - python-dateutil - - pytz - - s3fs>=0.4.0 - - scikit-learn - - scipy - - sqlalchemy - - statsmodels - - xarray - - xlrd<2.0 - - xlsxwriter - - xlwt - - pip - - pip: - - brotlipy - - coverage - - pandas-datareader - - pyxlsb diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml deleted file mode 100644 index c6eb3b00a63ac..0000000000000 --- a/ci/deps/actions-37-locale_slow.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4=4.6.0 - - bottleneck=1.2.* - - lxml - - matplotlib=3.0.0 - - numpy=1.17.* - - openpyxl=3.0.0 - - python-dateutil - - python-blosc - - pytz=2017.3 - - scipy - - sqlalchemy=1.3.0 - - xlrd=1.2.0 - - xlsxwriter=1.0.2 - - xlwt=1.3.0 - - html5lib=1.0.1 diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml deleted file mode 100644 index b97601d18917c..0000000000000 --- a/ci/deps/actions-37-minimum_versions.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.7.1 - - # tools - - cython=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - psutil - - # pandas dependencies - - beautifulsoup4=4.6.0 - - bottleneck=1.2.1 - - jinja2=2.10 - - numba=0.46.0 - - numexpr=2.7.0 - - numpy=1.17.3 - - openpyxl=3.0.0 - - pytables=3.5.1 - - python-dateutil=2.7.3 - - pytz=2017.3 - - pyarrow=0.17.0 - - scipy=1.2 - - xlrd=1.2.0 - - xlsxwriter=1.0.2 - - xlwt=1.3.0 - - html5lib=1.0.1 diff --git a/ci/deps/actions-37-slow.yaml b/ci/deps/actions-37-slow.yaml deleted file mode 100644 index 166f2237dcad3..0000000000000 --- a/ci/deps/actions-37-slow.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4 - - fsspec>=0.7.4 - - html5lib - - lxml - - matplotlib - - numexpr - - numpy - - openpyxl - - patsy - - psycopg2 - - pymysql - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.0 - - moto>=1.3.14 - - scipy - - sqlalchemy - - xlrd<2.0 - - xlsxwriter - - xlwt - - moto - - flask - - numba diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml deleted file mode 100644 index 0effe6f80df86..0000000000000 --- a/ci/deps/actions-37.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - botocore>=1.11 - - fsspec>=0.7.4 - - numpy=1.19 - - python-dateutil - - nomkl - - pyarrow - - pytz - - s3fs>=0.4.0 - - moto>=1.3.14 - - flask - - tabulate - - pyreadstat - - pip diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml new file mode 100644 index 0000000000000..06ffafeb70570 --- /dev/null +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -0,0 +1,71 @@ +# 
Non-dependencies that pandas utilizes or has compatibility with pandas objects +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8 + + # test dependencies + - cython>=0.29.32 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - brotlipy + - bottleneck + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numba + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow<10 + - pymysql + - pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy<1.4.46 + - tabulate + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard + + # downstream packages + - aiobotocore + - botocore + - cftime + - dask + - ipython + - geopandas-base + - seaborn + - scikit-learn + - statsmodels + - coverage + - pandas-datareader + - pyyaml + - py + - pytorch diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml deleted file mode 100644 index 34a6860936550..0000000000000 --- a/ci/deps/actions-38-locale.yaml +++ /dev/null @@ -1,41 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 - - pytest-asyncio>=0.12.0 - - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4 - - flask - - html5lib - - ipython - - jinja2 - - jedi<0.18.0 - - lxml - - matplotlib<3.3.0 - - moto - - nomkl - - numexpr - - numpy<1.20 # GH#39541 compat with pyarrow<3 - - openpyxl - - pytables - - python-dateutil - - pytz - - scipy - - xarray - - xlrd<2.0 - - xlsxwriter - - xlwt - - moto - - pyarrow=1.0.0 - - pip - - pip: - - pyxlsb diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml new file mode 100644 index 0000000000000..fd23080c2ab04 --- /dev/null +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -0,0 +1,57 @@ +# Minimum version of required + optional dependencies +# Aligned with getting_started/install.rst and compat/_optional.py +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8.0 + + # test dependencies + - cython>=0.29.32 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil=2.8.1 + - numpy=1.20.3 + - pytz=2020.1 + + # optional dependencies + - beautifulsoup4=4.9.3 + - blosc=1.21.0 + - bottleneck=1.3.2 + - brotlipy=0.7.0 + - fastparquet=0.4.0 + - fsspec=2021.07.0 + - html5lib=1.1 + - hypothesis=6.13.0 + - gcsfs=2021.07.0 + - jinja2=3.0.0 + - lxml=4.6.3 + - matplotlib=3.3.2 + - numba=0.53.1 + - numexpr=2.7.3 + - odfpy=1.4.1 + - openpyxl=3.0.7 + - pandas-gbq=0.15.0 + - psycopg2=2.8.6 + - pyarrow=1.0.1 + - pymysql=1.0.2 + - pyreadstat=1.1.2 + - pytables=3.6.1 + - python-snappy=0.6.0 + - pyxlsb=1.0.8 + - s3fs=2021.08.0 + - scipy=1.7.1 + - sqlalchemy=1.4.16 + - tabulate=0.8.9 + - tzdata=2022a + - xarray=0.19.0 + - xlrd=2.0.1 + - xlsxwriter=1.4.3 + - xlwt=1.3.0 + - zstandard=0.15.2 diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml deleted file mode 100644 index afba60e451b90..0000000000000 --- a/ci/deps/actions-38-slow.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.21 
- - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4 - - fsspec>=0.7.4 - - html5lib - - lxml - - matplotlib - - numexpr - - numpy - - openpyxl - - patsy - - psycopg2 - - pymysql - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.0 - - moto>=1.3.14 - - scipy - - sqlalchemy - - xlrd>=2.0 - - xlsxwriter - - xlwt - - moto - - flask - - numba diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 11daa92046eb4..222da40ea9eea 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -1,20 +1,54 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - python=3.8.* + - python=3.8 - # tools - - cython>=0.29.21 + # test dependencies + - cython>=0.29.32 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 - # pandas dependencies - - numpy + # required dependencies - python-dateutil - - nomkl + - numpy - pytz - - tabulate==0.8.7 + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numba + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow<10 + - pymysql + - pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy<1.4.46 + - tabulate + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b74f1af8ee0f6..1c60e8ad6d78a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -2,21 +2,54 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9.* + - python=3.9 - # tools - - cython>=0.29.21 + # test dependencies + - cython>=0.29.32 - pytest>=6.0 - pytest-cov - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 - # pandas dependencies - - numpy + # required dependencies - python-dateutil + - numpy - pytz # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numba + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + - pyarrow<10 + - pyreadstat - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 - scipy - - pyarrow=1.0 + - sqlalchemy<1.4.46 + - tabulate + - tzdata>=2022a + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml new file mode 100644 index 0000000000000..e06b992acc191 --- /dev/null +++ b/ci/deps/actions-pypy-38.yaml @@ -0,0 +1,21 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + # TODO: Add the rest of the dependencies in here + # once the other plentiful failures/segfaults + # with base pandas has been dealt with + - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + + # tools + - cython>=0.29.32 + - pytest>=6.0 + - pytest-cov + - pytest-asyncio + - pytest-xdist>=1.31 + - hypothesis>=5.5.3 + + # required + - numpy + - python-dateutil + - pytz diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml deleted file mode 100644 index 43e1055347f17..0000000000000 --- a/ci/deps/azure-macos-37.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - pytest>=6.0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - 
pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - bottleneck - - html5lib - - jinja2 - - lxml - - matplotlib=2.2.3 - - nomkl - - numexpr - - numpy=1.17.3 - - openpyxl - - pyarrow=0.17 - - pytables - - python-dateutil==2.7.3 - - pytz - - xarray - - xlrd<2.0 - - xlsxwriter - - xlwt - - pip - - pip: - - cython>=0.29.21 - - pyreadstat - - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml deleted file mode 100644 index 5cbc029f8c03d..0000000000000 --- a/ci/deps/azure-windows-37.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - bottleneck - - fsspec>=0.8.0 - - gcsfs>=0.6.0 - - html5lib - - jinja2 - - lxml - - matplotlib=2.2.* - - moto>=1.3.14 - - flask - - numexpr - - numpy=1.17.* - - openpyxl - - pyarrow=0.17.0 - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.2 - - scipy - - sqlalchemy - - xlrd>=2.0 - - xlsxwriter - - xlwt - - pyreadstat - - pip - - pip: - - pyxlsb diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml deleted file mode 100644 index 7fdecae626f9d..0000000000000 --- a/ci/deps/azure-windows-38.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: pandas-dev -channels: - - conda-forge - - defaults -dependencies: - - python=3.8.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - blosc - - bottleneck - - fastparquet>=0.4.0 - - flask - - fsspec>=0.8.0 - - matplotlib=3.1.3 - - moto>=1.3.14 - - numba - - numexpr - - numpy=1.18.* - - openpyxl - - jinja2 - - pyarrow>=0.17.0 - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.0 - - scipy - - xlrd<2.0 - - xlsxwriter - - xlwt diff --git a/ci/deps/circle-37-arm64.yaml b/ci/deps/circle-37-arm64.yaml deleted file mode 100644 index 995ebda1f97e7..0000000000000 --- a/ci/deps/circle-37-arm64.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.21 - - pytest>=6.0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - botocore>=1.11 - - numpy - - python-dateutil - - pytz - - pip - - flask - - pip: - - moto diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml new file mode 100644 index 0000000000000..263521fb74879 --- /dev/null +++ b/ci/deps/circle-38-arm64.yaml @@ -0,0 +1,55 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8 + + # test dependencies + - cython>=0.29.32 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1 + - numba + - numexpr + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow<10 + - pymysql + # Not provided on ARM + #- pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy<1.4.46 + - tabulate + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard diff --git a/ci/print_skipped.py b/ci/print_skipped.py deleted file mode 100755 index 60e2f047235e6..0000000000000 --- a/ci/print_skipped.py +++ /dev/null @@ -1,38 +0,0 
@@ -#!/usr/bin/env python3 -import os -import xml.etree.ElementTree as et - - -def main(filename): - if not os.path.isfile(filename): - raise RuntimeError(f"Could not find junit file {repr(filename)}") - - tree = et.parse(filename) - root = tree.getroot() - current_class = "" - for el in root.iter("testcase"): - cn = el.attrib["classname"] - for sk in el.findall("skipped"): - old_class = current_class - current_class = cn - if old_class != current_class: - yield None - yield { - "class_name": current_class, - "test_name": el.attrib["name"], - "message": sk.attrib["message"], - } - - -if __name__ == "__main__": - print("SKIPPED TESTS:") - i = 1 - for test_data in main("test-data.xml"): - if test_data is None: - print("-" * 80) - else: - print( - f"#{i} {test_data['class_name']}." - f"{test_data['test_name']}: {test_data['message']}" - ) - i += 1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 0d6f26d8c29f8..e6de5caf955fc 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,12 +5,17 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +# May help reproduce flaky CI builds if set in subsequent runs +echo PYTHONHASHSEED=$PYTHONHASHSEED + if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi -if [ "$COVERAGE" ]; then +if [[ "$COVERAGE" == "true" ]]; then COVERAGE="-s --cov=pandas --cov-report=xml --cov-append" +else + COVERAGE="" # We need to reset this for COVERAGE="false" case fi # If no X server is found, we use xvfb to emulate it @@ -19,18 +24,26 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" +PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" -if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then - # GH#37455 windows py38 build appears to be running out of memory - # skip collection of window tests - PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/moments --ignore=pandas/tests/plotting/" +if [[ "$PATTERN" ]]; then + PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" -PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" +if [[ "$PANDAS_DATA_MANAGER" != "array" && "$PYTEST_TARGET" == "pandas" ]]; then + # The ArrayManager tests should have already been run by PYTEST_CMD if PANDAS_DATA_MANAGER was already set to array + # If we're targeting specific files, e.g. test_downstream.py, don't run. 
+ PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" + + if [[ "$PATTERN" ]]; then + PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"$PATTERN and arraymanager\"" + else + PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"arraymanager\"" + fi -echo $PYTEST_AM_CMD -sh -c "$PYTEST_AM_CMD" + echo $PYTEST_AM_CMD + sh -c "$PYTEST_AM_CMD" +fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh deleted file mode 100755 index 2e16bc6545161..0000000000000 --- a/ci/setup_env.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -e - -# edit the locale file if needed -if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then - echo "Adding locale to the first line of pandas/__init__.py" - rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" - sed -i "$SEDC" pandas/__init__.py - - echo "[head -4 pandas/__init__.py]" - head -4 pandas/__init__.py - echo -fi - - -echo "Install Miniconda" -DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest" -if [[ "$(uname -m)" == 'aarch64' ]]; then - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh" -elif [[ "$(uname)" == 'Linux' ]]; then - if [[ "$BITS32" == "yes" ]]; then - CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh" - else - CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh" - fi -elif [[ "$(uname)" == 'Darwin' ]]; then - CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh" -else - echo "OS $(uname) not supported" - exit 1 -fi -echo "Downloading $CONDA_URL" -wget -q $CONDA_URL -O miniconda.sh -chmod +x miniconda.sh - -MINICONDA_DIR="$HOME/miniconda3" -rm -rf $MINICONDA_DIR -./miniconda.sh -b -p $MINICONDA_DIR -export PATH=$MINICONDA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -conda install pip conda # create conda to create a historical artifact for pip & setuptools -conda update -n base conda - -echo "conda info -a" -conda info -a - -echo "source deactivate" -source deactivate - -echo "conda list (root environment)" -conda list - -# Clean up any left-over from a previous build -conda remove --all -q -y -n pandas-dev - -echo -echo "conda env create -q --file=${ENV_FILE}" -time conda env create -q --file="${ENV_FILE}" - - -if [[ "$BITS32" == "yes" ]]; then - # activate 32-bit compiler - export CONDA_BUILD=1 -fi - -echo "activate pandas-dev" -source activate pandas-dev - -# Explicitly set an environment variable indicating that this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. -export PANDAS_CI=1 - -echo -echo "remove any installed pandas package" -echo "w/o removing anything else" -conda remove pandas -y --force || true -pip uninstall -y pandas || true - -echo -echo "remove postgres if has been installed with conda" -echo "we use the one from the CI" -conda remove postgresql -y --force || true - -echo -echo "remove qt" -echo "causes problems with the clipboard, we use xsel for that" -conda remove qt -y --force || true - -echo -echo "conda list pandas" -conda list pandas - -# Make sure any error below is reported as such - -echo "[Build extensions]" -python setup.py build_ext -q -j2 - -echo "[Updating pip]" -python -m pip install --no-deps -U pip wheel setuptools - -echo "[Install pandas]" -python -m pip install --no-build-isolation -e . 
- -echo -echo "conda list" -conda list - -# Install DB for Linux - -if [[ -n ${SQL:0} ]]; then - echo "installing dbs" - mysql -e 'create database pandas_nosetest;' - psql -c 'create database pandas_nosetest;' -U postgres -else - echo "not using dbs on non-linux Travis builds or Azure Pipelines" -fi -echo "done" diff --git a/codecov.yml b/codecov.yml index 893e40db004a6..d893bdbdc9298 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,5 +1,5 @@ codecov: - branch: master + branch: main notify: after_n_builds: 10 comment: false @@ -12,6 +12,7 @@ coverage: patch: default: target: '50' + informational: true github_checks: annotations: false diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html new file mode 100644 index 0000000000000..6d8caa4d6c741 --- /dev/null +++ b/doc/_templates/pandas_footer.html @@ -0,0 +1,3 @@ + diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html index 7e0043e771e72..8298b66568e20 100644 --- a/doc/_templates/sidebar-nav-bs.html +++ b/doc/_templates/sidebar-nav-bs.html @@ -1,9 +1,9 @@ diff --git a/doc/data/fx_prices b/doc/data/fx_prices deleted file mode 100644 index 38cadf26909a3..0000000000000 Binary files a/doc/data/fx_prices and /dev/null differ diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv deleted file mode 100644 index 935ff936cd842..0000000000000 --- a/doc/data/mindex_ex.csv +++ /dev/null @@ -1,16 +0,0 @@ -year,indiv,zit,xit -1977,"A",1.2,.6 -1977,"B",1.5,.5 -1977,"C",1.7,.8 -1978,"A",.2,.06 -1978,"B",.7,.2 -1978,"C",.8,.3 -1978,"D",.9,.5 -1978,"E",1.4,.9 -1979,"C",.2,.15 -1979,"D",.14,.05 -1979,"E",.5,.15 -1979,"F",1.2,.5 -1979,"G",3.4,1.9 -1979,"H",5.4,2.7 -1979,"I",6.4,1.2 diff --git a/doc/data/test.xls b/doc/data/test.xls deleted file mode 100644 index db0f9dec7d5e4..0000000000000 Binary files a/doc/data/test.xls and /dev/null differ diff --git a/doc/make.py b/doc/make.py index 5d2476fcdca8d..c758c7fc84bbb 100755 --- a/doc/make.py +++ b/doc/make.py @@ -45,7 +45,7 @@ def __init__( single_doc=None, verbosity=0, warnings_are_errors=False, - ): + ) -> None: self.num_jobs = num_jobs self.include_api = include_api self.whatsnew = whatsnew diff --git a/doc/redirects.csv b/doc/redirects.csv index 9b8a5a73dedff..fda09d7644a49 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -45,6 +45,7 @@ contributing_docstring,development/contributing_docstring developer,development/developer extending,development/extending internals,development/internals +development/meeting,community # api moved function reference/api/pandas.io.json.json_normalize,pandas.json_normalize diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index 84eafa308175c..2a348e5b84e6e 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -154,7 +154,7 @@ ul.task-bullet > li > p:first-child { .comparison-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-block { @@ -163,19 +163,18 @@ ul.task-bullet > li > p:first-child { .install-card .card-header { border: none; - background-color:white; + background-color: transparent; padding: 1rem 1rem 0rem 1rem; } .install-card .card-header p.card-text { - color: #150458; font-size: 1.1rem; font-weight: bold; } .install-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-card pre { diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 
452c7d20ff5df..a08be3301edda 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -5,6 +5,10 @@ --pst-color-info: 23, 162, 184; } +table { + width: auto; /* Override fit-content which breaks Styler user guide ipynb */ +} + /* Main index page overview cards */ .intro-card { @@ -25,7 +29,7 @@ .intro-card .card-header { border: none; - background-color:white; + background-color: transparent; color: #150458 !important; font-size: var(--pst-font-size-h5); font-weight: bold; @@ -34,7 +38,7 @@ .intro-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .intro-card .card-footer p.card-text{ @@ -42,3 +46,7 @@ margin-left: auto; margin-right: auto; } + +.card, .card img { + background-color: transparent !important; +} diff --git a/doc/source/_static/index_api.svg b/doc/source/_static/index_api.svg index 70bf0d3504b1a..69f7ba1d2d114 100644 --- a/doc/source/_static/index_api.svg +++ b/doc/source/_static/index_api.svg @@ -64,29 +64,29 @@ inkscape:connector-curvature="0" id="path899" d="M 324.96812,187.09499 H 303.0455 v 72.1639 h 22.67969" - style="fill:none;stroke:#150458;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> diff --git a/doc/source/_static/index_getting_started.svg b/doc/source/_static/index_getting_started.svg index d00e462427193..2d36622cb7e55 100644 --- a/doc/source/_static/index_getting_started.svg +++ b/doc/source/_static/index_getting_started.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(2.9219487,-8.5995374)"> diff --git a/doc/source/_static/index_user_guide.svg b/doc/source/_static/index_user_guide.svg index a567103af5918..bd170535170a3 100644 --- a/doc/source/_static/index_user_guide.svg +++ b/doc/source/_static/index_user_guide.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(141.8903,-20.32143)"> + + + + + + + + \ No newline at end of file diff --git a/doc/source/_static/logo_sql.svg b/doc/source/_static/logo_sql.svg index 4a5b7d0b1b943..38b3b2c726214 100644 --- a/doc/source/_static/logo_sql.svg +++ b/doc/source/_static/logo_sql.svg @@ -58,10 +58,10 @@ d="m 18.846017,1.608 c -0.497,-0.326 -1.193,-0.615 -2.069,-0.858 -1.742,-0.484 -4.05,-0.75 -6.498,-0.75 -2.4480004,0 -4.7560004,0.267 -6.4980004,0.75 -0.877,0.243 -1.573,0.532 -2.069,0.858 -0.619,0.407 -0.93299996,0.874 -0.93299996,1.391 v 12 c 0,0.517 0.31399996,0.985 0.93299996,1.391 0.497,0.326 1.193,0.615 2.069,0.858 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.877,-0.243 1.573,-0.532 2.069,-0.858 0.619,-0.406 0.933,-0.874 0.933,-1.391 v -12 c 0,-0.517 -0.314,-0.985 -0.933,-1.391 z M 4.0490166,1.713 c 1.658,-0.46 3.87,-0.714 6.2300004,-0.714 2.36,0 4.573,0.254 6.23,0.714 1.795,0.499 2.27,1.059 2.27,1.286 0,0.227 -0.474,0.787 -2.27,1.286 -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 
-1.795,-0.499 -2.27,-1.059 -2.27,-1.286 0,-0.227 0.474,-0.787 2.27,-1.286 z M 16.509017,16.285 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 v -2.566 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 8.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 4.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z" id="path2" inkscape:connector-curvature="0" - style="fill:#000000" /> + style="fill:#888888" /> `_ -standard and uses `Black `_ -and `Flake8 `_ to ensure a -consistent code format throughout the project. We encourage you to use -:ref:`pre-commit ` to automatically run ``black``, -``flake8``, ``isort``, and related code checks when you make a git commit. - -Patterns -======== - -We use a ``flake8`` plugin, `pandas-dev-flaker `_, to -check our codebase for unwanted patterns. See its ``README`` for the up-to-date list of rules we enforce. - -Testing -======= - -Failing tests --------------- - -See https://docs.pytest.org/en/latest/skipping.html for background. - -Do not use ``pytest.xfail`` ---------------------------- - -Do not use this method. It has the same behavior as ``pytest.skip``, namely -it immediately stops the test and does not check if the test will fail. If -this is the behavior you desire, use ``pytest.skip`` instead. - -Using ``pytest.mark.xfail`` ---------------------------- - -Use this method if a test is known to fail but the manner in which it fails -is not meant to be captured. It is common to use this method for a test that -exhibits buggy behavior or a non-implemented feature. If -the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. - -Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` -over usage within a test so that the test is appropriately marked during the -collection phase of pytest. For xfailing a test that involves multiple -parameters, a fixture, or a combination of these, it is only possible to -xfail during the testing phase. To do so, use the ``request`` fixture: - -.. code-block:: python - - import pytest - - def test_xfail(request): - mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") - request.node.add_marker(mark) - -xfail is not to be used for tests involving failure due to invalid user arguments. -For these tests, we need to verify the correct exception type and error message -is being raised, using ``pytest.raises`` instead. - -Miscellaneous -============= - -Reading from a url ------------------- - -**Good:** - -.. 
code-block:: python - - from pandas.io.common import urlopen - - with urlopen("http://www.google.com/") as url: - raw_text = url.read() diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst new file mode 100644 index 0000000000000..59689a2cf51d1 --- /dev/null +++ b/doc/source/development/community.rst @@ -0,0 +1,119 @@ +.. _community: + +===================== +Contributor community +===================== + +pandas is a community-driven open source project developed by a large group +of `contributors `_ +and a smaller group of `maintainers `_. +The pandas leadership has made a strong commitment to creating an open, +inclusive, and positive community. Please read the pandas `Code of Conduct +`_ for guidance on how to +interact with others in a way that makes the community thrive. + +We offer several meetings and communication channels to share knowledge and +connect with others within the pandas community. + +Community meeting +----------------- + +The pandas Community Meeting is a regular sync meeting for the project's +maintainers which is open to the community. Everyone is welcome to attend and +contribute to conversations. + +The meetings take place on the second Wednesday of each month at 18:00 UTC. + +The minutes of past meetings are available in `this Google Document `__. + + +New contributor meeting +----------------------- + +On the third Wednesday of the month, we hold meetings to welcome and support +new contributors in our community. + +| 👋 you all are invited +| 💬 everyone can present (add yourself to the hackMD agenda) +| 👀 anyone can sit in and listen + +Attendees are new and experienced contributors, as well as a few maintainers. +We aim to answer questions about getting started, or help with work in +progress when possible, as well as get to know each other and share our +learnings and experiences. + +The agenda for the next meeting and minutes of past meetings are available in +`this HackMD `__. + +Calendar +-------- + +This calendar shows all the community meetings. Our community meetings are +ideal for anyone wanting to contribute to pandas, or just curious to know how +current development is going. + +.. raw:: html + + + +You can subscribe to this calendar with the following links: + +* `iCal `__ +* `Google calendar `__ + +Additionally, we'll sometimes have one-off meetings on specific topics. +These will be published on the same calendar. + +`GitHub issue tracker `_ +---------------------------------------------------------------------- + +The pandas contributor community conducts conversations mainly via this channel. +Any community member can open issues to: + +- Report bugs, e.g. "I noticed the behavior of a certain function is + incorrect" +- Request features, e.g. "I would like this error message to be more readable" +- Request documentation improvements, e.g. "I found this section unclear" +- Ask questions, e.g. "I noticed the behavior of a certain function + changed between versions. Is this expected?". + + Ideally your questions should be related to how pandas works rather + than how you use pandas. `StackOverflow `_ is + better suited for answering usage questions, and we ask that all usage + questions are first asked on StackOverflow. Thank you for respecting our + time and wishes. 🙇 + +Maintainers and frequent contributors might also open issues to discuss the +ongoing development of the project.
For example: +- Report issues with the CI, GitHub Actions, or the performance of pandas +- Open issues relating to the internals +- Start roadmap discussions aligning on proposals for what to do in future + releases or changes to the API. +- Open issues relating to the project's website, logo, or governance + +The developer mailing list +-------------------------- + +The pandas mailing list `pandas-dev@python.org `_ is used for long-form +conversations and to engage people in the wider community who might not +be active on the issue tracker but whom we would like to include in discussions. + +.. _community.slack: + +Community slack +--------------- + +We have a chat platform for contributors, maintainers and potential +contributors. This is not a space for user questions, rather for questions about +contributing to pandas. The slack is a private space, specifically meant for +people who are hesitant to bring up their questions or ideas on a large public +mailing list or GitHub. + +If this sounds like the right place for you, you are welcome to join! Email us +at `slack@pandas.pydata.org `_ and let us +know that you read and agree to our `Code of Conduct `_ +😉 to get an invite. And please remember the slack is not meant to replace the +mailing list or issue tracker - all important announcements and conversations +should still happen there. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index f4a09e0daa750..faa3d29a628f9 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -45,8 +45,13 @@ assigned issues, since people may not be working in them anymore. If you want to that is assigned, feel free to kindly ask the current assignee if you can take it (please allow at least a week of inactivity before considering work in the issue discontinued). -Feel free to ask questions on the `mailing list -`_ or on `Gitter`_. +We have several :ref:`contributor community ` communication channels, which you are +welcome to join, and ask questions as you figure things out. Among them are regular meetings for +new contributors, dev meetings, a dev mailing list, and a slack for the contributor community. +All pandas contributors are welcome to these spaces, where they can connect with each other. Even +maintainers who have been with us for a long time felt just like you when they started out, and +are happy to welcome you and support you as you get to know how we work, and where things are. +Take a look at the next sections to learn more. .. _contributing.bug_reports: @@ -59,7 +64,7 @@ will allow others to reproduce the bug and provide insight into fixing. See `this blogpost `_ for tips on writing a good bug report. -Trying the bug-producing code out on the *master* branch is often a worthwhile exercise +Trying the bug-producing code out on the *main* branch is often a worthwhile exercise to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed. @@ -143,7 +148,7 @@ as the version number cannot be computed anymore. Creating a branch ----------------- -You want your master branch to reflect only production-ready code, so create a +You want your main branch to reflect only production-ready code, so create a feature branch for making your changes. For example:: git branch shiny-new-feature @@ -158,14 +163,14 @@ changes in this branch specific to one bug or feature so it is clear what the branch brings to pandas.
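Equivalently, the branch can be created and checked out in a single step directly from the latest upstream ``main`` (a sketch, assuming the ``upstream`` remote points at the main pandas repository)::

    git fetch upstream
    git checkout -b shiny-new-feature upstream/main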
You can have many shiny-new-features and switch in between them using the git checkout command. -When creating this branch, make sure your master branch is up to date with -the latest upstream master version. To update your local master branch, you +When creating this branch, make sure your main branch is up to date with +the latest upstream main version. To update your local main branch, you can do:: - git checkout master - git pull upstream master --ff-only + git checkout main + git pull upstream main --ff-only -When you want to update the feature branch with changes in master after +When you want to update the feature branch with changes in main after you created the branch, check the section on :ref:`updating a PR `. @@ -256,7 +261,7 @@ double check your branch changes against the branch it was based on: #. Navigate to your repository on GitHub -- https://github.com/your-user-name/pandas #. Click on ``Branches`` #. Click on the ``Compare`` button for your feature branch -#. Select the ``base`` and ``compare`` branches, if necessary. This will be ``master`` and +#. Select the ``base`` and ``compare`` branches, if necessary. This will be ``main`` and ``shiny-new-feature``, respectively. Finally, make the pull request @@ -264,8 +269,8 @@ Finally, make the pull request If everything looks good, you are ready to make a pull request. A pull request is how code from a local repository becomes available to the GitHub community and can be looked -at and eventually merged into the master version. This pull request and its associated -changes will eventually be committed to the master branch and available in the next +at and eventually merged into the main version. This pull request and its associated +changes will eventually be committed to the main branch and available in the next release. To submit a pull request: #. Navigate to your repository on GitHub @@ -294,14 +299,14 @@ This will automatically update your pull request with the latest code and restar :any:`Continuous Integration ` tests. Another reason you might need to update your pull request is to solve conflicts -with changes that have been merged into the master branch since you opened your +with changes that have been merged into the main branch since you opened your pull request. -To do this, you need to "merge upstream master" in your branch:: +To do this, you need to "merge upstream main" in your branch:: git checkout shiny-new-feature git fetch upstream - git merge upstream/master + git merge upstream/main If there are no conflicts (or they could be fixed automatically), a file with a default commit message will open, and you can simply save and quit this file. @@ -313,7 +318,7 @@ Once the conflicts are merged and the files where the conflicts were solved are added, you can run ``git commit`` to save those fixes. If you have uncommitted changes at the moment you want to update the branch with -master, you will need to ``stash`` them prior to updating (see the +main, you will need to ``stash`` them prior to updating (see the `stash docs `__). This will effectively store your changes and they can be reapplied after updating. @@ -326,23 +331,22 @@ Autofixing formatting errors ---------------------------- We use several styling checks (e.g. ``black``, ``flake8``, ``isort``) which are run after -you make a pull request. If there is a scenario where any of these checks fail then you -can comment:: +you make a pull request. - @github-actions pre-commit - -on that pull request. 
This will trigger a workflow which will autofix formatting errors. +To automatically fix formatting errors on each commit you make, you can +set up pre-commit yourself. First, create a Python :ref:`environment +` and then set up :ref:`pre-commit `. Delete your merged branch (optional) ------------------------------------ Once your feature branch is accepted into upstream, you'll probably want to get rid of -the branch. First, merge upstream master into your branch so git knows it is safe to +the branch. First, merge upstream main into your branch so git knows it is safe to delete your branch:: git fetch upstream - git checkout master - git merge upstream/master + git checkout main + git merge upstream/main Then you can do:: @@ -355,8 +359,6 @@ The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature -.. _Gitter: https://gitter.im/pydata/pandas - Tips for a successful pull request ================================== diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e812aaa760a8f..26692057f3e23 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -23,31 +23,30 @@ contributing them to the project:: ./ci/code_checks.sh -The script verifies the linting of code files, it looks for common mistake patterns -(like missing spaces around sphinx directives that make the documentation not -being rendered properly) and it also validates the doctests. It is possible to -run the checks independently by using the parameters ``lint``, ``patterns`` and -``doctests`` (e.g. ``./ci/code_checks.sh lint``). +The script validates the doctests, formatting in docstrings, and +imported modules. It is possible to run the checks independently by using the +parameters ``docstring``, ``code``, and ``doctests`` +(e.g. ``./ci/code_checks.sh doctests``). In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -In addition to ``./ci/code_checks.sh``, some extra checks are run by -``pre-commit`` - see :ref:`here ` for how to -run them. - -Additional standards are outlined on the :ref:`pandas code style guide `. +In addition to ``./ci/code_checks.sh``, some extra checks (including static type +checking) are run by ``pre-commit`` - see :ref:`here ` +for how to run them. .. _contributing.pre-commit: Pre-commit ---------- -You can run many of these styling checks manually as we have described above. However, -we encourage you to use `pre-commit hooks `_ instead -to automatically run ``black``, ``flake8``, ``isort`` when you make a git commit. This +Additionally, :ref:`Continuous Integration ` will run code formatting checks +like ``black``, ``flake8`` (including a `pandas-dev-flaker `_ plugin), +``isort``, and ``cpplint`` and more using `pre-commit hooks `_ +Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, +it is helpful to run the check yourself before submitting code. This can be done by installing ``pre-commit``:: pip install pre-commit @@ -70,12 +69,17 @@ to run its checks with:: without needing to have done ``pre-commit install`` beforehand. 
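For day-to-day work a typical pattern, sketched here assuming ``pre-commit`` has been installed as above, is to enable the git hook once and occasionally run the checks by hand on the files you touched::

    pre-commit install
    pre-commit run --files pandas/core/frame.py pandas/core/series.py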
-If you want to run checks on all recently committed files on upstream/master you can use:: +If you want to run checks on all recently committed files on upstream/main you can use:: - pre-commit run --from-ref=upstream/master --to-ref=HEAD --all-files + pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files without needing to have done ``pre-commit install`` beforehand. +.. note:: + + You may want to periodically run ``pre-commit gc``, to clean up repos + which are no longer used. + .. note:: If you have conflicting installations of ``virtualenv``, then you may get an @@ -100,157 +104,8 @@ All optional dependencies should be documented in :ref:`install.optional_dependencies` and the minimum required version should be set in the ``pandas.compat._optional.VERSIONS`` dict. -C (cpplint) -~~~~~~~~~~~ - -pandas uses the `Google `_ -standard. Google provides an open source style checker called ``cpplint``, but we -use a fork of it that can be found `here `__. -Here are *some* of the more common ``cpplint`` issues: - -* we restrict line-length to 80 characters to promote readability -* every header file must include a header guard to avoid name collisions if re-included - -:ref:`Continuous Integration ` will run the -`cpplint `_ tool -and report any stylistic errors in your code. Therefore, it is helpful before -submitting code to run the check yourself:: - - cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir modified-c-file - -You can also run this command on an entire directory if necessary:: - - cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive modified-c-directory - -To make your commits compliant with this standard, you can install the -`ClangFormat `_ tool, which can be -downloaded `here `__. To configure, in your home directory, -run the following command:: - - clang-format style=google -dump-config > .clang-format - -Then modify the file to ensure that any indentation width parameters are at least four. -Once configured, you can run the tool as follows:: - - clang-format modified-c-file - -This will output what your file will look like if the changes are made, and to apply -them, run the following command:: - - clang-format -i modified-c-file - -To run the tool on an entire directory, you can run the following analogous commands:: - - clang-format modified-c-directory/*.c modified-c-directory/*.h - clang-format -i modified-c-directory/*.c modified-c-directory/*.h - -Do note that this tool is best-effort, meaning that it will try to correct as -many errors as possible, but it may not correct *all* of them. Thus, it is -recommended that you run ``cpplint`` to double check and make any other style -fixes manually. - -.. _contributing.code-formatting: - -Python (PEP8 / black) -~~~~~~~~~~~~~~~~~~~~~ - -pandas follows the `PEP8 `_ standard -and uses `Black `_ and -`Flake8 `_ to ensure a consistent code -format throughout the project. We encourage you to use :ref:`pre-commit `. - -:ref:`Continuous Integration ` will run those tools and -report any stylistic errors in your code. Therefore, it is helpful before -submitting code to run the check yourself:: - - black pandas - git diff upstream/master -u -- "*.py" | flake8 --diff - -to auto-format your code. Additionally, many editors have plugins that will -apply ``black`` as you edit files. - -You should use a ``black`` version 21.5b2 as previous versions are not compatible -with the pandas codebase. 
- -One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this -command will catch any stylistic errors in your changes specifically, but -be beware it may not catch all of them. For example, if you delete the only -usage of an imported function, it is stylistically incorrect to import an -unused function. However, style-checking the diff will not catch this because -the actual import is not part of the diff. Thus, for completeness, you should -run this command, though it may take longer:: - - git diff upstream/master --name-only -- "*.py" | xargs -r flake8 - -Note that on OSX, the ``-r`` flag is not available, so you have to omit it and -run this slightly modified command:: - - git diff upstream/master --name-only -- "*.py" | xargs flake8 - -Windows does not support the ``xargs`` command (unless installed for example -via the `MinGW `__ toolchain), but one can imitate the -behaviour as follows:: - - for /f %i in ('git diff upstream/master --name-only -- "*.py"') do flake8 %i - -This will get all the files being changed by the PR (and ending with ``.py``), -and run ``flake8`` on them, one after the other. - -Note that these commands can be run analogously with ``black``. - -.. _contributing.import-formatting: - -Import formatting -~~~~~~~~~~~~~~~~~ -pandas uses `isort `__ to standardise import -formatting across the codebase. - -A guide to import layout as per pep8 can be found `here `__. - -A summary of our current import sections ( in order ): - -* Future -* Python Standard Library -* Third Party -* ``pandas._libs``, ``pandas.compat``, ``pandas.util._*``, ``pandas.errors`` (largely not dependent on ``pandas.core``) -* ``pandas.core.dtypes`` (largely not dependent on the rest of ``pandas.core``) -* Rest of ``pandas.core.*`` -* Non-core ``pandas.io``, ``pandas.plotting``, ``pandas.tseries`` -* Local application/library specific imports - -Imports are alphabetically sorted within these sections. - -As part of :ref:`Continuous Integration ` checks we run:: - - isort --check-only pandas - -to check that imports are correctly formatted as per the ``setup.cfg``. - -If you see output like the below in :ref:`Continuous Integration ` checks: - -.. code-block:: shell - - Check import format using isort - ERROR: /home/travis/build/pandas-dev/pandas/pandas/io/pytables.py Imports are incorrectly sorted - Check import format using isort DONE - The command "ci/code_checks.sh" exited with 1 - -You should run:: - - isort pandas/io/pytables.py - -to automatically format imports correctly. This will modify your local copy of the files. - -Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: - - git diff upstream/master --name-only -- "*.py" | xargs -r isort - -Where similar caveats apply if you are on OSX or Windows. - -You can then verify the changes look ok, then git :any:`commit ` and :any:`push `. - Backwards compatibility -~~~~~~~~~~~~~~~~~~~~~~~ +----------------------- Please try to maintain backward compatibility. pandas has lots of users with lots of existing code, so don't break it if at all possible. If you think breakage is required, @@ -272,6 +127,7 @@ Otherwise, you need to do it manually: .. code-block:: python import warnings + from pandas.util._exceptions import find_stack_level def old_func(): @@ -280,7 +136,11 @@ Otherwise, you need to do it manually: .. deprecated:: 1.1.0 Use new_func instead. 
""" - warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) + warnings.warn( + 'Use new_func instead.', + FutureWarning, + stacklevel=find_stack_level(), + ) new_func() @@ -304,7 +164,7 @@ pandas strongly encourages the use of :pep:`484` style type hints. New developme Style guidelines ~~~~~~~~~~~~~~~~ -Types imports should follow the ``from typing import ...`` convention. So rather than +Type imports should follow the ``from typing import ...`` convention. Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than .. code-block:: python @@ -316,21 +176,31 @@ You should write .. code-block:: python - from typing import List, Optional, Union + primes: list[int] = [] + +``Optional`` should be avoided in favor of the shorter ``| None``, so instead of + +.. code-block:: python + + from typing import Union - primes: List[int] = [] + maybe_primes: list[Union[int, None]] = [] -``Optional`` should be used where applicable, so instead of +or .. code-block:: python - maybe_primes: List[Union[int, None]] = [] + from typing import Optional + + maybe_primes: list[Optional[int]] = [] You should write .. code-block:: python - maybe_primes: List[Optional[int]] = [] + from __future__ import annotations # noqa: F404 + + maybe_primes: list[int | None] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -362,7 +232,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you ... else: # Reasonably only str objects would reach this but... obj = cast(str, obj) # Mypy complains without this! - return obj.upper() + return obj.upper() The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable @@ -380,7 +250,7 @@ With custom types and inference this is not always possible so exceptions are ma pandas-specific types ~~~~~~~~~~~~~~~~~~~~~ -Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. +Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. 
Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module @@ -396,24 +266,48 @@ This module will ultimately house types for repeatedly used concepts like "path- Validating type hints ~~~~~~~~~~~~~~~~~~~~~ -pandas uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running +pandas uses `mypy `_ and `pyright `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running .. code-block:: shell - mypy pandas + # the following might fail if the installed pandas version does not correspond to your local git version + pre-commit run --hook-stage manual --all-files + + # if the above fails due to stubtest + SKIP=stubtest pre-commit run --hook-stage manual --all-files + +in your activated python environment. A recent version of ``numpy`` (>=1.22.0) is required for type validation. .. _contributing.ci: +Testing type hints in code using pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + * Pandas is not yet a py.typed library (:pep:`561`)! + The primary purpose of locally declaring pandas as a py.typed library is to test and + improve the pandas-builtin type annotations. + +Until pandas becomes a py.typed library, it is possible to easily experiment with the type +annotations shipped with pandas by creating an empty file named "py.typed" in the pandas +installation folder: + +.. code-block:: none + + python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" + +The existence of the py.typed file signals to type checkers that pandas is already a py.typed +library. This makes type checkers aware of the type annotations shipped with pandas. + Testing with continuous integration ----------------------------------- -The pandas test suite will run automatically on `GitHub Actions `__ and -`Azure Pipelines `__ +The pandas test suite will run automatically on `GitHub Actions `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here -for `GitHub Actions `__ and -`Azure Pipelines `__. +for `GitHub Actions `__. A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, then you will get a red 'X', where you can click through to see the individual failed tests. @@ -424,8 +318,8 @@ This is an example of a green build. .. _contributing.tdd: -Test-driven development/code writing ------------------------------------- +Test-driven development +----------------------- pandas is serious about testing and strongly encourages contributors to embrace `test-driven development (TDD) `_. @@ -439,61 +333,190 @@ use cases and writing corresponding tests. Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. -Like many packages, pandas uses `pytest -`_ and the convenient -extensions in `numpy.testing -`_. - -.. note:: - - The earliest supported pytest version is 5.0.1. - Writing tests ~~~~~~~~~~~~~ All tests should go into the ``tests`` subdirectory of the specific package. 
This folder contains many current examples of tests, and we suggest looking to these for -inspiration. If your test requires working with files or -network connectivity, there is more information on the `testing page -`_ of the wiki. +inspiration. Ideally, there should be one, and only one, obvious place for a test to reside. +Until we reach that ideal, these are some rules of thumb for where a test should +be located. + +1. Does your test depend only on code in ``pd._libs.tslibs``? + This test likely belongs in one of: + + - tests.tslibs + + .. note:: + + No file in ``tests.tslibs`` should import from any pandas modules + outside of ``pd._libs.tslibs`` + + - tests.scalar + - tests.tseries.offsets + +2. Does your test depend only on code in pd._libs? + This test likely belongs in one of: + + - tests.libs + - tests.groupby.test_libgroupby + +3. Is your test for an arithmetic or comparison method? + This test likely belongs in one of: + + - tests.arithmetic + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray using the ``box_with_array`` + fixture. + + - tests.frame.test_arithmetic + - tests.series.test_arithmetic + +4. Is your test for a reduction method (min, max, sum, prod, ...)? + This test likely belongs in one of: + + - tests.reductions + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray. + + - tests.frame.test_reductions + - tests.series.test_reductions + - tests.test_nanops + +5. Is your test for an indexing method? + This is the most difficult case for deciding where a test belongs, because + there are many of these tests, and many of them test more than one method + (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``) + + A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``, + ``Index.get_indexer``)? + This test likely belongs in one of: + + - tests.indexes.test_indexing + - tests.indexes.fooindex.test_indexing + + Within that files there should be a method-specific test class e.g. + ``TestGetLoc``. + + In most cases, neither ``Series`` nor ``DataFrame`` objects should be + needed in these tests. + + B) Is the test for a Series or DataFrame indexing method *other* than + ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``, + ``mask``, ``lookup``, or ``insert``? + This test likely belongs in one of: + + - tests.frame.indexing.test_methodname + - tests.series.indexing.test_methodname + + C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``? + This test likely belongs in one of: + + - tests.indexing.test_loc + - tests.indexing.test_iloc + - tests.indexing.test_at + - tests.indexing.test_iat + + Within the appropriate file, test classes correspond to either types of + indexers (e.g. ``TestLocBooleanMask``) or major use cases + (e.g. ``TestLocSetitemWithExpansion``). + + See the note in section D) about tests that test multiple indexing methods. + + D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``, + ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``? + This test likely belongs in one of: + + - tests.series.test_getitem + - tests.series.test_setitem + - tests.frame.test_getitem + - tests.frame.test_setitem -The ``pandas._testing`` module has many special ``assert`` functions that -make it easier to make statements about whether Series or DataFrame objects are -equivalent. 
The easiest way to verify that your code is correct is to -explicitly construct the result you expect, then compare the actual result to -the expected correct result:: + If many cases such a test may test multiple similar methods, e.g. - def test_pivot(self): - data = { - 'index' : ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns' : ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values' : [1., 2., 3., 3., 2., 1.] - } + .. code-block:: python - frame = DataFrame(data) - pivoted = frame.pivot(index='index', columns='columns', values='values') + import pandas as pd + import pandas._testing as tm - expected = DataFrame({ - 'One' : {'A' : 1., 'B' : 2., 'C' : 3.}, - 'Two' : {'A' : 1., 'B' : 2., 'C' : 3.} - }) + def test_getitem_listlike_of_ints(): + ser = pd.Series(range(5)) - assert_frame_equal(pivoted, expected) + result = ser[[3, 4]] + expected = pd.Series([2, 3]) + tm.assert_series_equal(result, expected) -Please remember to add the Github Issue Number as a comment to a new test. -E.g. "# brief comment, see GH#28907" + result = ser.loc[[3, 4]] + tm.assert_series_equal(result, expected) -Transitioning to ``pytest`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In cases like this, the test location should be based on the *underlying* + method being tested. Or in the case of a test for a bugfix, the location + of the actual bug. So in this example, we know that ``Series.__getitem__`` + calls ``Series.loc.__getitem__``, so this is *really* a test for + ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``. + +6. Is your test for a DataFrame or Series method? + + A) Is the method a plotting method? + This test likely belongs in one of: + + - tests.plotting + + B) Is the method an IO method? + This test likely belongs in one of: + + - tests.io + + C) Otherwise + This test likely belongs in one of: + + - tests.series.methods.test_mymethod + - tests.frame.methods.test_mymethod + + .. note:: + + If a test can be shared between DataFrame/Series using the + ``frame_or_series`` fixture, by convention it goes in the + ``tests.frame`` file. + +7. Is your test for an Index method, not depending on Series/DataFrame? + This test likely belongs in one of: + + - tests.indexes + +8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, + ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, + ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? + This test likely belongs in one of: + + - tests.arrays + +9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")? + This test likely belongs in one of: + + - tests.extension + +Using ``pytest`` +~~~~~~~~~~~~~~~~ + +Test structure +^^^^^^^^^^^^^^ pandas existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. .. code-block:: python - class TestReallyCoolFeature: - pass + class TestReallyCoolFeature: + def test_cool_feature_aspect(self): + pass -Going forward, we are moving to a more *functional* style using the `pytest `__ framework, which offers a richer testing +We prefer a more *functional* style using the `pytest `__ framework, which offers a richer testing framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: .. code-block:: python @@ -501,21 +524,135 @@ framework that will facilitate testing and developing. 
Thus, instead of writing def test_really_cool_feature(): pass -Using ``pytest`` -~~~~~~~~~~~~~~~~ +Preferred ``pytest`` idioms +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Functional tests named ``def test_*`` and *only* take arguments that are either fixtures or parameters. +* Use a bare ``assert`` for testing scalars and truth-testing +* Use ``tm.assert_series_equal(result, expected)`` and ``tm.assert_frame_equal(result, expected)`` for comparing :class:`Series` and :class:`DataFrame` results respectively. +* Use `@pytest.mark.parameterize `__ when testing multiple cases. +* Use `pytest.mark.xfail `__ when a test case is expected to fail. +* Use `pytest.mark.skip `__ when a test case is never expected to pass. +* Use `pytest.param `__ when a test case needs a particular mark. +* Use `@pytest.fixture `__ if multiple tests can share a setup object. + +.. warning:: -Here is an example of a self-contained set of tests that illustrate multiple features that we like to use. + Do not use ``pytest.xfail`` (which is different than ``pytest.mark.xfail``) since it immediately stops the + test and does not check if the test will fail. If this is the behavior you desire, use ``pytest.skip`` instead. -* functional style: tests are like ``test_*`` and *only* take arguments that are either fixtures or parameters -* ``pytest.mark`` can be used to set metadata on test functions, e.g. ``skip`` or ``xfail``. -* using ``parametrize``: allow testing of multiple cases -* to set a mark on a parameter, ``pytest.param(..., marks=...)`` syntax should be used -* ``fixture``, code for object construction, on a per-test basis -* using bare ``assert`` for scalars and truth-testing -* ``tm.assert_series_equal`` (and its counter part ``tm.assert_frame_equal``), for pandas object comparisons. -* the typical pattern of constructing an ``expected`` and comparing versus the ``result`` +If a test is known to fail but the manner in which it fails +is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that +exhibits buggy behavior or a non-implemented feature. If +the failing test has flaky behavior, use the argument ``strict=False``. This +will make it so pytest does not fail if the test happens to pass. -We would name this file ``test_cool_feature.py`` and put in an appropriate place in the ``pandas/tests/`` structure. +Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` +over usage within a test so that the test is appropriately marked during the +collection phase of pytest. For xfailing a test that involves multiple +parameters, a fixture, or a combination of these, it is only possible to +xfail during the testing phase. To do so, use the ``request`` fixture: + +.. code-block:: python + + def test_xfail(request): + mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") + request.node.add_marker(mark) + +xfail is not to be used for tests involving failure due to invalid user arguments. +For these tests, we need to verify the correct exception type and error message +is being raised, using ``pytest.raises`` instead. + +.. _contributing.warnings: + +Testing a warning +^^^^^^^^^^^^^^^^^ + +Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning. + +.. code-block:: python + + with tm.assert_produces_warning(DeprecationWarning): + pd.deprecated_function() + +If a warning should specifically not happen in a block of code, pass ``False`` into the context manager. + +.. 
code-block:: python + + with tm.assert_produces_warning(False): + pd.no_warning_function() + +If you have a test that would emit a warning, but you aren't actually testing the +warning itself (say because it's going to be removed in the future, or because we're +matching a 3rd-party library's behavior), then use ``pytest.mark.filterwarnings`` to +ignore the error. + +.. code-block:: python + + @pytest.mark.filterwarnings("ignore:msg:category") + def test_thing(self): + pass + +If you need finer-grained control, you can use Python's +`warnings module `__ +to control whether a warning is ignored or raised at different places within +a single test. + +.. code-block:: python + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + +Testing an exception +^^^^^^^^^^^^^^^^^^^^ + +Use `pytest.raises `_ as a context manager +with the specific exception subclass (i.e. never use :py:class:`Exception`) and the exception message in ``match``. + +.. code-block:: python + + with pytest.raises(ValueError, match="an error"): + raise ValueError("an error") + +Testing involving files +^^^^^^^^^^^^^^^^^^^^^^^ + +The ``tm.ensure_clean`` context manager creates a temporary file for testing, +with a generated filename (or your filename if provided), that is automatically +deleted when the context block is exited. + +.. code-block:: python + + with tm.ensure_clean('my_file_path') as path: + # do something with the path + +Testing involving network connectivity +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and +lack of ownership of the server that is being connected to. If network connectivity is absolutely required, use the +``tm.network`` decorator. + +.. code-block:: python + + @tm.network # noqa + def test_network(): + result = package.call_to_internet() + +If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator. + +.. code-block:: python + + @tm.network("https://www.somespecificsite.com/", check_before_test=True) + def test_network(): + result = pd.read_html("https://www.somespecificsite.com/") + +Example +^^^^^^^ + +Here is an example of a self-contained set of tests in a file ``pandas/tests/test_cool_feature.py`` +that illustrate multiple features that we like to use. Please remember to add the Github Issue Number +as a comment to a new test. .. code-block:: python @@ -548,6 +685,7 @@ We would name this file ``test_cool_feature.py`` and put in an appropriate place def test_series(series, dtype): + # GH result = series.astype(dtype) assert result.dtype == dtype @@ -595,7 +733,7 @@ Tests that we have ``parametrized`` are now accessible via the test name, for ex Using ``hypothesis`` ~~~~~~~~~~~~~~~~~~~~ -Hypothesis is a library for property-based testing. Instead of explicitly +Hypothesis is a library for property-based testing. Instead of explicitly parametrizing a test, you can describe *all* valid inputs and let Hypothesis try to find a failing input. Even better, no matter how many random examples it tries, Hypothesis always reports a single minimal counterexample to your @@ -630,59 +768,6 @@ preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. -.. 
_contributing.warnings: - -Testing warnings -~~~~~~~~~~~~~~~~ - -By default, one of pandas CI workers will fail if any unhandled warnings are emitted. - -If your change involves checking that a warning is actually emitted, use -``tm.assert_produces_warning(ExpectedWarning)``. - - -.. code-block:: python - - import pandas._testing as tm - - - df = pd.DataFrame() - with tm.assert_produces_warning(FutureWarning): - df.some_operation() - -We prefer this to the ``pytest.warns`` context manager because ours checks that the warning's -stacklevel is set correctly. The stacklevel is what ensure the *user's* file name and line number -is printed in the warning, rather than something internal to pandas. It represents the number of -function calls from user code (e.g. ``df.some_operation()``) to the function that actually emits -the warning. Our linter will fail the build if you use ``pytest.warns`` in a test. - -If you have a test that would emit a warning, but you aren't actually testing the -warning itself (say because it's going to be removed in the future, or because we're -matching a 3rd-party library's behavior), then use ``pytest.mark.filterwarnings`` to -ignore the error. - -.. code-block:: python - - @pytest.mark.filterwarnings("ignore:msg:category") - def test_thing(self): - ... - -If the test generates a warning of class ``category`` whose message starts -with ``msg``, the warning will be ignored and the test will pass. - -If you need finer-grained control, you can use Python's usual -`warnings module `__ -to control whether a warning is ignored / raised at different places within -a single test. - -.. code-block:: python - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - # Or use warnings.filterwarnings(...) - -Alternatively, consider breaking up the unit test. - Running the test suite ---------------------- @@ -692,8 +777,7 @@ install pandas) by typing:: pytest pandas -The tests suite is exhaustive and takes around 20 minutes to run. Often it is -worth running only a subset of tests first around your changes before running the +Often it is worth running only a subset of tests first around your changes before running the entire suite. The easiest way to do this is with:: @@ -741,10 +825,10 @@ Running the performance test suite Performance matters and it is worth considering whether your code has introduced performance regressions. pandas is in the process of migrating to -`asv benchmarks `__ +`asv benchmarks `__ to enable easy monitoring of the performance of critical pandas operations. These benchmarks are all found in the ``pandas/asv_bench`` directory, and the -test results can be found `here `__. +test results can be found `here `__. To use all features of asv, you will need either ``conda`` or ``virtualenv``. For more details please check the `asv installation @@ -752,18 +836,18 @@ webpage `_. To install asv:: - pip install git+https://github.com/spacetelescope/asv + pip install git+https://github.com/airspeed-velocity/asv If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: - asv continuous -f 1.1 upstream/master HEAD + asv continuous -f 1.1 upstream/main HEAD You can replace ``HEAD`` with the name of the branch you are working on, and report benchmarks that changed by more than 10%. The command uses ``conda`` by default for creating the benchmark environments. 
If you want to use virtualenv instead, write:: - asv continuous -f 1.1 -E virtualenv upstream/master HEAD + asv continuous -f 1.1 -E virtualenv upstream/main HEAD The ``-E virtualenv`` option should be added to all ``asv`` commands that run benchmarks. The default value is defined in ``asv.conf.json``. @@ -775,12 +859,12 @@ do not cause unexpected performance regressions. You can run specific benchmark using the ``-b`` flag, which takes a regular expression. For example, this will only run benchmarks from a ``pandas/asv_bench/benchmarks/groupby.py`` file:: - asv continuous -f 1.1 upstream/master HEAD -b ^groupby + asv continuous -f 1.1 upstream/main HEAD -b ^groupby If you want to only run a specific group of benchmarks from a file, you can do it using ``.`` as a separator. For example:: - asv continuous -f 1.1 upstream/master HEAD -b groupby.GroupByMethods + asv continuous -f 1.1 upstream/main HEAD -b groupby.GroupByMethods will only run the ``GroupByMethods`` benchmark defined in ``groupby.py``. @@ -812,7 +896,21 @@ Changes should be reflected in the release notes located in ``doc/source/whatsne This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the -issue/pull request number). +issue/pull request number). Your entry should be written using full sentences and proper +grammar. + +When mentioning parts of the API, use a Sphinx ``:func:``, ``:meth:``, or ``:class:`` +directive as appropriate. Not all public API functions and methods have a +documentation page; ideally links would only be added if they resolve. You can +usually find similar examples by checking the release notes for one of the previous +versions. + +If your code is a bugfix, add your entry to the relevant bugfix section. Avoid +adding to the ``Other`` section; only in rare cases should entries go there. +Being as concise as possible, the description of the bug should include how the +user may encounter it and an indication of the bug itself, e.g. +"produces incorrect results" or "incorrectly raises". It may be necessary to also +indicate the new behavior. If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 623d1e8d45565..a87d8d5ad44bf 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -68,7 +68,7 @@ explained in this document: * `numpydoc docstring guide `_ (which is based in the original `Guide to NumPy/SciPy documentation - `_) + `_) numpydoc is a Sphinx extension to support the NumPy docstring convention. diff --git a/doc/source/development/contributing_documentation.rst b/doc/source/development/contributing_documentation.rst index a4a4f781d9dad..fac6a91ce82f2 100644 --- a/doc/source/development/contributing_documentation.rst +++ b/doc/source/development/contributing_documentation.rst @@ -12,7 +12,11 @@ you don't have to be an expert on pandas to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help -the next person. +the next person. 
Please visit the `issues page `__ +for a full list of issues that are currently open regarding the +Pandas documentation. + + .. contents:: Documentation: :local: @@ -89,16 +93,6 @@ Some other important things to know about the docs: ``doc/source/reference``, else Sphinx will emit a warning. -.. note:: - - The ``.rst`` files are used to automatically generate Markdown and HTML versions - of the docs. For this reason, please do not edit ``CONTRIBUTING.md`` directly, - but instead make any changes to ``doc/source/development/contributing.rst``. Then, to - generate ``CONTRIBUTING.md``, use `pandoc `_ - with the following command:: - - pandoc doc/source/development/contributing.rst -t markdown_github > CONTRIBUTING.md - The utility script ``scripts/validate_docstrings.py`` can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of @@ -202,10 +196,10 @@ And you'll have the satisfaction of seeing your new and improved documentation! .. _contributing.dev_docs: -Building master branch documentation +Building main branch documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When pull requests are merged into the pandas ``master`` branch, the main parts of +When pull requests are merged into the pandas ``main`` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :any:`Continuous Integration ` section. diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index bc0a3556b9ac1..942edd863a19a 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -10,104 +10,46 @@ To test out code changes, you'll need to build pandas from source, which requires a C/C++ compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing to the documentation ` but if you skip creating the development environment you won't be able to build the documentation -locally before pushing your changes. +locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks `. .. contents:: Table of contents: :local: +Step 1: install a C compiler +---------------------------- -Creating an environment using Docker --------------------------------------- - -Instead of manually setting up a development environment, you can use `Docker -`_ to automatically create the environment with just several -commands. pandas provides a ``DockerFile`` in the root directory to build a Docker image -with a full pandas development environment. - -**Docker Commands** - -Pass your GitHub username in the ``DockerFile`` to use your own fork:: - - # Build the image pandas-yourname-env - docker build --tag pandas-yourname-env . - # Run a container and bind your local forked repo, pandas-yourname, to the container - docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env - -Even easier, you can integrate Docker with the following IDEs: - -**Visual Studio Code** - -You can use the DockerFile to launch a remote session with Visual Studio Code, -a popular free IDE, using the ``.devcontainer.json`` file. -See https://code.visualstudio.com/docs/remote/containers for details. - -**PyCharm (Professional)** - -Enable Docker support and use the Services tool window to build and manage images as well as -run and interact with containers. 
-See https://www.jetbrains.com/help/pycharm/docker.html for details. - -Note that you might need to rebuild the C extensions if/when you merge with upstream/master using:: - - python setup.py build_ext -j 4 - - -Creating an environment without Docker ---------------------------------------- - -Installing a C compiler -~~~~~~~~~~~~~~~~~~~~~~~ - -pandas uses C extensions (mostly written using Cython) to speed up certain -operations. To install pandas from source, you need to compile these C -extensions, which means you need a C compiler. This process depends on which -platform you're using. - -If you have setup your environment using ``conda``, the packages ``c-compiler`` -and ``cxx-compiler`` will install a fitting compiler for your platform that is -compatible with the remaining conda packages. On Windows and macOS, you will -also need to install the SDKs as they have to be distributed separately. -These packages will automatically be installed by using the ``pandas`` -``environment.yml`` file. +How to do this will depend on your platform. If you choose to user ``Docker`` +in the next step, then you can skip this step. **Windows** -You will need `Build Tools for Visual Studio 2017 -`_. - -.. warning:: - You DO NOT need to install Visual Studio 2019. - You only need "Build Tools for Visual Studio 2019" found by - scrolling down to "All downloads" -> "Tools for Visual Studio 2019". - In the installer, select the "C++ build tools" workload. +You will need `Build Tools for Visual Studio 2022 +`_. -You can install the necessary components on the commandline using -`vs_buildtools.exe `_: +.. note:: + You DO NOT need to install Visual Studio 2022. + You only need "Build Tools for Visual Studio 2022" found by + scrolling down to "All downloads" -> "Tools for Visual Studio". + In the installer, select the "Desktop development with C++" Workloads. -.. code:: +Alternatively, you can install the necessary components on the commandline using +`vs_BuildTools.exe `_ - vs_buildtools.exe --quiet --wait --norestart --nocache ^ - --installPath C:\BuildTools ^ - --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^ - --add Microsoft.VisualStudio.Component.VC.v141 ^ - --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^ - --add Microsoft.VisualStudio.Component.Windows10SDK.17763 - -To setup the right paths on the commandline, call -``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``. +Alternatively, you could use the `WSL `_ +and consult the ``Linux`` instructions below. **macOS** -To use the ``conda``-based compilers, you will need to install the +To use the :ref:`mamba `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. Otherwise information about compiler installation can be found here: https://devguide.python.org/setup/#macos **Linux** -For Linux-based ``conda`` installations, you won't have to install any -additional components outside of the conda environment. The instructions -below are only needed if your setup isn't based on conda environments. +For Linux-based :ref:`mamba ` installations, you won't have to install any +additional components outside of the mamba environment. The instructions +below are only needed if your setup isn't based on mamba environments. Some Linux distributions will come with a pre-installed C compiler. 
To find out which compilers (and versions) are installed on your system:: @@ -119,81 +61,42 @@ which compilers (and versions) are installed on your system:: `GCC (GNU Compiler Collection) `_, is a widely used compiler, which supports C and a number of other languages. If GCC is listed -as an installed compiler nothing more is required. If no C compiler is -installed (or you wish to install a newer version) you can install a compiler -(GCC in the example code below) with:: - - # for recent Debian/Ubuntu: - sudo apt install build-essential - # for Red Had/RHEL/CentOS/Fedora - yum groupinstall "Development Tools" - -For other Linux distributions, consult your favorite search engine for -compiler installation instructions. +as an installed compiler nothing more is required. -Let us know if you have any difficulties by opening an issue or reaching out on `Gitter `_. +If no C compiler is installed, or you wish to upgrade, or you're using a different +Linux distribution, consult your favorite search engine for compiler installation/update +instructions. +Let us know if you have any difficulties by opening an issue or reaching out on our contributor +community :ref:`Slack `. -Creating a Python environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Step 2: create an isolated environment +---------------------------------------- -Now create an isolated pandas development environment: +Before we begin, please: -* Install either `Anaconda `_, `miniconda - `_, or `miniforge `_ -* Make sure your conda is up to date (``conda update conda``) * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory -We'll now kick off a three-step process: +.. _contributing.mamba: -1. Install the build dependencies -2. Build and install pandas -3. Install the optional dependencies +Option 1: using mamba (recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Install `mamba `_ +* Make sure your mamba is up to date (``mamba update mamba``) .. code-block:: none # Create and activate the build environment - conda env create -f environment.yml - conda activate pandas-dev - - # or with older versions of Anaconda: - source activate pandas-dev - - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - -At this point you should be able to import pandas from your locally built version:: - - $ python # start an interpreter - >>> import pandas - >>> print(pandas.__version__) - 0.22.0.dev0+29.g4ad6d4d74 - -This will create the new environment, and not touch any of your existing environments, -nor any existing Python installation. - -To view your environments:: - - conda info -e - -To return to your root environment:: + mamba env create --file environment.yml + mamba activate pandas-dev - conda deactivate +Option 2: using pip +~~~~~~~~~~~~~~~~~~~ -See the full conda docs `here `__. - - -Creating a Python environment (pip) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you aren't using conda for your development environment, follow these instructions. -You'll need to have at least the :ref:`minimum Python version ` that pandas supports. If your Python version -is 3.8.0 (or later), you might need to update your ``setuptools`` to version 42.0.0 (or later) -in your development environment before installing the build dependencies:: - - pip install --upgrade setuptools +You'll need to have at least the :ref:`minimum Python version ` that pandas supports. +You also need to have ``setuptools`` 51.0.0 or later to build pandas. 
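If you are unsure whether your interpreter meets these requirements, a quick check along the following lines can save a failed build. This is only an illustrative sketch, not part of the official setup steps; the version numbers simply mirror the requirements stated above (Python 3.8+ and ``setuptools`` 51.0.0+), and it assumes ``setuptools`` is importable in the environment you intend to build in.

.. code-block:: python

   import sys

   import setuptools

   # Minimum versions referenced above; adjust if the documented requirements change.
   assert sys.version_info >= (3, 8), "pandas requires Python 3.8 or newer"
   print("Python:", sys.version.split()[0])
   print("setuptools:", setuptools.__version__)  # should be at least 51.0.0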
**Unix**/**macOS with virtualenv** @@ -210,10 +113,6 @@ in your development environment before installing the build dependencies:: # Install the build dependencies python -m pip install -r requirements-dev.txt - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - **Unix**/**macOS with pyenv** Consult the docs for setting up pyenv `here `__. @@ -222,11 +121,10 @@ Consult the docs for setting up pyenv `here `__. # Create a virtual environment # Use an ENV_DIR of your choice. We'll use ~/Users//.pyenv/versions/pandas-dev - pyenv virtualenv # For instance: - pyenv virtualenv 3.7.6 pandas-dev + pyenv virtualenv 3.9.10 pandas-dev # Activate the virtualenv pyenv activate pandas-dev @@ -234,19 +132,15 @@ Consult the docs for setting up pyenv `here `__. # Now install the build dependencies in the cloned pandas repo python -m pip install -r requirements-dev.txt - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - **Windows** Below is a brief overview on how to set-up a virtual environment with Powershell under Windows. For details please refer to the -`official virtualenv user guide `__ +`official virtualenv user guide `__. -Use an ENV_DIR of your choice. We'll use ~\\virtualenvs\\pandas-dev where -'~' is the folder pointed to by either $env:USERPROFILE (Powershell) or -%USERPROFILE% (cmd.exe) environment variable. Any parent directories +Use an ENV_DIR of your choice. We'll use ``~\\virtualenvs\\pandas-dev`` where +``~`` is the folder pointed to by either ``$env:USERPROFILE`` (Powershell) or +``%USERPROFILE%`` (cmd.exe) environment variable. Any parent directories should already exist. .. code-block:: powershell @@ -260,6 +154,59 @@ should already exist. # Install the build dependencies python -m pip install -r requirements-dev.txt +Option 3: using Docker +~~~~~~~~~~~~~~~~~~~~~~ + +pandas provides a ``DockerFile`` in the root directory to build a Docker image +with a full pandas development environment. + +**Docker Commands** + +Build the Docker image:: + + # Build the image + docker build -t pandas-dev . + +Run Container:: + + # Run a container and bind your local repo to the container + # This command assumes you are running from your local repo + # but if not alter ${PWD} to match your local repo path + docker run -it --rm -v ${PWD}:/home/pandas pandas-dev + +*Even easier, you can integrate Docker with the following IDEs:* + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the ``.devcontainer.json`` file. +See https://code.visualstudio.com/docs/remote/containers for details. + +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Step 3: build and install pandas +-------------------------------- + +You can now run:: + # Build and install pandas python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 + +At this point you should be able to import pandas from your locally built version:: + + $ python + >>> import pandas + >>> print(pandas.__version__) # note: the exact output may differ + 2.0.0.dev0+880.g2b9e661fbb.dirty + +This will create the new environment, and not touch any of your existing environments, +nor any existing Python installation. + +.. 
note:: + You will need to repeat this step each time the C extensions change, for example + if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``. diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst index 894277d304020..7ba2091e18853 100644 --- a/doc/source/development/debugging_extensions.rst +++ b/doc/source/development/debugging_extensions.rst @@ -80,7 +80,7 @@ Once the process launches, simply type ``run`` and the test suite will begin, st Checking memory leaks with valgrind =================================== -You can use `Valgrind `_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run: +You can use `Valgrind `_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run: .. code-block:: sh diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index d701208792a4c..6de237b70f08d 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -180,7 +180,7 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0', + 'pandas_version': '1.4.0', 'creator': { 'library': 'pyarrow', 'version': '0.13.0' diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d5b45f5953453..c7286616672b9 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -50,7 +50,7 @@ decorate a class, providing the name of attribute to add. The class's Now users can access your methods using the ``geo`` namespace: - >>> ds = pd.Dataframe( + >>> ds = pd.DataFrame( ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} ... ) >>> ds.geo.center @@ -74,10 +74,11 @@ applies only to certain dtypes. Extension types --------------- -.. warning:: +.. note:: - The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and - experimental. They may change between versions without warning. + The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs were + experimental prior to pandas 1.5. Starting with version 1.5, future changes will follow + the :ref:`pandas deprecation policy `. pandas defines an interface for implementing data types and arrays that *extend* NumPy's type system. pandas itself uses the extension system for some types @@ -106,7 +107,7 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``. See the `extension dtype source`_ for interface definition. -:class:`pandas.api.extension.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. +:class:`pandas.api.extensions.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate ``Series`` and ``.astype()`` with a registered string name, for example ``'category'`` is a registered string accessor for the ``CategoricalDtype``. @@ -125,7 +126,7 @@ data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for ``Categorical``). They may be backed by none, one, or many NumPy arrays. 
For example, -``pandas.Categorical`` is an extension array backed by two arrays, +:class:`pandas.Categorical` is an extension array backed by two arrays, one for codes and one for categories. An array of IPv6 addresses may be backed by a NumPy structured array with two fields, one for the lower 64 bits and one for the upper 64 bits. Or they may be backed @@ -231,7 +232,7 @@ Testing extension arrays We provide a test suite for ensuring that your extension arrays satisfy the expected behavior. To use the test suite, you must provide several pytest fixtures and inherit from the base test class. The required fixtures are found in -https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/conftest.py. +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/conftest.py. To use a test, subclass it: @@ -244,7 +245,7 @@ To use a test, subclass it: pass -See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py +See https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/base/__init__.py for a list of all the tests available. .. _extending.extension.arrow: @@ -290,9 +291,9 @@ See more in the `Arrow documentation `__ +Libraries implementing the plotting backend should use `entry points `__ to make their backend discoverable to pandas. The key is ``"pandas_plotting_backends"``. For example, pandas registers the default "matplotlib" backend as follows. @@ -486,4 +487,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. +https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1. diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index fb50a88c6637f..c741441cf67a1 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -16,13 +16,11 @@ Development contributing_environment contributing_documentation contributing_codebase - code_style maintaining internals - test_writing debugging_extensions extending developer policies roadmap - meeting + community diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index a0e9ba53acd00..1bff2eccd3d27 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -138,7 +138,7 @@ Reviewing pull requests ----------------------- Anybody can review a pull request: regular contributors, triagers, or core-team -members. But only core-team members can merge pull requets when they're ready. +members. But only core-team members can merge pull requests when they're ready. Here are some things to check when reviewing a pull request. @@ -151,16 +151,37 @@ Here are some things to check when reviewing a pull request. for regression fixes and small bug fixes, the next minor milestone otherwise) * Changes should comply with our :ref:`policies.version`. + +.. _maintaining.backporting: + Backporting ----------- -In the case you want to apply changes to a stable branch from a newer branch then you -can comment:: +pandas supports point releases (e.g. ``1.4.3``) that aim to: + +1. Fix bugs in new features introduced in the first minor version release. + + * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3`` + +2. Fix bugs that used to work in a few minor releases prior. 
There should be agreement between core team members that a backport is appropriate. + + * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``. + +Since pandas minor releases are based on Github branches (e.g. point release of ``1.4`` are based off the ``1.4.x`` branch), +"backporting" means merging a pull request fix to the ``main`` branch and correct minor branch associated with the next point release. + +By default, if a pull request is assigned to the next point release milestone within the Github interface, +the backporting process should happen automatically by the ``@meeseeksdev`` bot once the pull request is merged. +A new pull request will be made backporting the pull request to the correct version branch. +Sometimes due to merge conflicts, a manual pull request will need to be made addressing the code conflict. + +If the bot does not automatically start the backporting process, you can also write a Github comment in the merged pull request +to trigger the backport:: @meeseeksdev backport version-branch This will trigger a workflow which will backport a given change to a branch -(e.g. @meeseeksdev backport 1.2.x) +(e.g. @meeseeksdev backport 1.4.x) Cleaning up old issues ---------------------- @@ -204,6 +225,18 @@ The full process is outlined in our `governance documents`_. In summary, we're happy to give triage permissions to anyone who shows interest by being helpful on the issue tracker. +The required steps for adding a maintainer are: + +1. Contact the contributor and ask their interest to join. +2. Add the contributor to the appropriate `Github Team `_ if accepted the invitation. + + * ``pandas-core`` is for core team members + * ``pandas-triage`` is for pandas triage members + +3. Add the contributor to the pandas Google group. +4. Create a pull request to add the contributor's Github handle to ``pandas-dev/pandas/web/pandas/config.yml``. +5. Create a pull request to add the contributor's name/Github handle to the `governance document `_. + The current list of core-team members is at https://github.com/pandas-dev/pandas-governance/blob/master/people.md @@ -236,5 +269,40 @@ a milestone before tagging, you can request the bot to backport it with: @Meeseeksdev backport +.. _maintaining.asv-machine: + +Benchmark machine +----------------- + +The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results +are published to http://pandas.pydata.org/speed/pandas/ + +Configuration +````````````` + +The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner. + +Publishing +`````````` + +The results are published to another Github repository, https://github.com/tomaugspurger/asv-collection. +Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``. +Ask Tom or Joris for access to the webserver. + +Debugging +````````` + +The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them + + ssh -L 8080:localhost:8080 pandas@panda.likescandy.com + + +.. _maintaining.release: + +Release process +--------------- + +The process for releasing a new version of pandas can be found at https://github.com/pandas-dev/pandas-release + .. _governance documents: https://github.com/pandas-dev/pandas-governance -.. 
_list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization +.. _list of permissions: https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-roles-for-an-organization diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst deleted file mode 100644 index 35826af5912c2..0000000000000 --- a/doc/source/development/meeting.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _meeting: - -================== -Developer meetings -================== - -We hold regular developer meetings on the second Wednesday -of each month at 18:00 UTC. These meetings and their minutes are open to -the public. All are welcome to join. - -Minutes -------- - -The minutes of past meetings are available in `this Google Document `__. - -Calendar --------- - -This calendar shows all the developer meetings. - -.. raw:: html - - - -You can subscribe to this calendar with the following links: - -* `iCal `__ -* `Google calendar `__ - -Additionally, we'll sometimes have one-off meetings on specific topics. -These will be published on the same calendar. diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index f8e6bda2085d8..d75262c08dfd6 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time. Python support ~~~~~~~~~~~~~~ -pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in -pandas **major** or **minor** releases. +pandas mirrors the `NumPy guidelines for Python support `__. + .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index 37e45bf5a42b5..f935c27d9917d 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -74,8 +74,7 @@ types. This includes consistent behavior in all operations (indexing, arithmetic operations, comparisons, etc.). There has been discussion of eventually making the new semantics the default. -This has been discussed at -`github #28095 `__ (and +This has been discussed at :issue:`28095` (and linked issues), and described in more detail in this `design doc `__. @@ -129,8 +128,51 @@ We propose that it should only work with positional indexing, and the translatio to positions should be entirely done at a higher level. Indexing is a complicated API with many subtleties. This refactor will require care -and attention. More details are discussed at -https://github.com/pandas-dev/pandas/wiki/(Tentative)-rules-for-restructuring-indexing-code +and attention. The following principles should inspire refactoring of indexing code and +should result on cleaner, simpler, and more performant code. + +1. **Label indexing must never involve looking in an axis twice for the same label(s).** +This implies that any validation step must either: + + * limit validation to general features (e.g. dtype/structure of the key/index), or + * reuse the result for the actual indexing. + +2. **Indexers must never rely on an explicit call to other indexers.** +For instance, it is OK to have some internal method of ``.loc`` call some +internal method of ``__getitem__`` (or of their common base class), +but never in the code flow of ``.loc`` should ``the_obj[something]`` appear. + +3. **Execution of positional indexing must never involve labels** (as currently, sadly, happens). 
+That is, the code flow of a getter call (or a setter call in which the right hand side is non-indexed) +to ``.iloc`` should never involve the axes of the object in any way. + +4. **Indexing must never involve accessing/modifying values** (i.e., act on ``._data`` or ``.values``) **more than once.** +The following steps must hence be clearly decoupled: + + * find positions we need to access/modify on each axis + * (if we are accessing) derive the type of object we need to return (dimensionality) + * actually access/modify the values + * (if we are accessing) construct the return object + +5. As a corollary to the decoupling between 4.i and 4.iii, **any code which deals on how data is stored** +(including any combination of handling multiple dtypes, and sparse storage, categoricals, third-party types) +**must be independent from code that deals with identifying affected rows/columns**, +and take place only once step 4.i is completed. + + * In particular, such code should most probably not live in ``pandas/core/indexing.py`` + * ... and must not depend in any way on the type(s) of axes (e.g. no ``MultiIndex`` special cases) + +6. As a corollary to point 1.i, **``Index`` (sub)classes must provide separate methods for any desired validity check of label(s) which does not involve actual lookup**, +on the one side, and for any required conversion/adaptation/lookup of label(s), on the other. + +7. **Use of trial and error should be limited**, and anyway restricted to catch only exceptions +which are actually expected (typically ``KeyError``). + + * In particular, code should never (intentionally) raise new exceptions in the ``except`` portion of a ``try... exception`` + +8. **Any code portion which is not specific to setters and getters must be shared**, +and when small differences in behavior are expected (e.g. getting with ``.loc`` raises for +missing labels, setting still doesn't), they can be managed with a specific parameter. Numba-accelerated operations ---------------------------- @@ -205,4 +247,4 @@ We improved the pandas documentation * :ref:`getting_started` contains a number of resources intended for new pandas users coming from a variety of backgrounds (:issue:`26831`). -.. _pydata-sphinx-theme: https://github.com/pandas-dev/pydata-sphinx-theme +.. _pydata-sphinx-theme: https://github.com/pydata/pydata-sphinx-theme diff --git a/doc/source/development/test_writing.rst b/doc/source/development/test_writing.rst deleted file mode 100644 index 76eae505471b7..0000000000000 --- a/doc/source/development/test_writing.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. _test_organization: - -Test organization -================= -Ideally, there should be one, and only one, obvious place for a test to reside. -Until we reach that ideal, these are some rules of thumb for where a test should -be located. - -1. Does your test depend only on code in ``pd._libs.tslibs``? - This test likely belongs in one of: - - - tests.tslibs - - .. note:: - - No file in ``tests.tslibs`` should import from any pandas modules - outside of ``pd._libs.tslibs`` - - - tests.scalar - - tests.tseries.offsets - -2. Does your test depend only on code in pd._libs? - This test likely belongs in one of: - - - tests.libs - - tests.groupby.test_libgroupby - -3. Is your test for an arithmetic or comparison method? - This test likely belongs in one of: - - - tests.arithmetic - - .. note:: - - These are intended for tests that can be shared to test the behavior - of DataFrame/Series/Index/ExtensionArray using the ``box_with_array`` - fixture. 
- - - tests.frame.test_arithmetic - - tests.series.test_arithmetic - -4. Is your test for a reduction method (min, max, sum, prod, ...)? - This test likely belongs in one of: - - - tests.reductions - - .. note:: - - These are intended for tests that can be shared to test the behavior - of DataFrame/Series/Index/ExtensionArray. - - - tests.frame.test_reductions - - tests.series.test_reductions - - tests.test_nanops - -5. Is your test for an indexing method? - This is the most difficult case for deciding where a test belongs, because - there are many of these tests, and many of them test more than one method - (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``) - - A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``, - ``Index.get_indexer``)? - This test likely belongs in one of: - - - tests.indexes.test_indexing - - tests.indexes.fooindex.test_indexing - - Within that files there should be a method-specific test class e.g. - ``TestGetLoc``. - - In most cases, neither ``Series`` nor ``DataFrame`` objects should be - needed in these tests. - - B) Is the test for a Series or DataFrame indexing method *other* than - ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``, - ``mask``, ``lookup``, or ``insert``? - This test likely belongs in one of: - - - tests.frame.indexing.test_methodname - - tests.series.indexing.test_methodname - - C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``? - This test likely belongs in one of: - - - tests.indexing.test_loc - - tests.indexing.test_iloc - - tests.indexing.test_at - - tests.indexing.test_iat - - Within the appropriate file, test classes correspond to either types of - indexers (e.g. ``TestLocBooleanMask``) or major use cases - (e.g. ``TestLocSetitemWithExpansion``). - - See the note in section D) about tests that test multiple indexing methods. - - D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``, - ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``? - This test likely belongs in one of: - - - tests.series.test_getitem - - tests.series.test_setitem - - tests.frame.test_getitem - - tests.frame.test_setitem - - If many cases such a test may test multiple similar methods, e.g. - - .. code-block:: python - - import pandas as pd - import pandas._testing as tm - - def test_getitem_listlike_of_ints(): - ser = pd.Series(range(5)) - - result = ser[[3, 4]] - expected = pd.Series([2, 3]) - tm.assert_series_equal(result, expected) - - result = ser.loc[[3, 4]] - tm.assert_series_equal(result, expected) - - In cases like this, the test location should be based on the *underlying* - method being tested. Or in the case of a test for a bugfix, the location - of the actual bug. So in this example, we know that ``Series.__getitem__`` - calls ``Series.loc.__getitem__``, so this is *really* a test for - ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``. - -6. Is your test for a DataFrame or Series method? - - A) Is the method a plotting method? - This test likely belongs in one of: - - - tests.plotting - - B) Is the method an IO method? - This test likely belongs in one of: - - - tests.io - - C) Otherwise - This test likely belongs in one of: - - - tests.series.methods.test_mymethod - - tests.frame.methods.test_mymethod - - .. note:: - - If a test can be shared between DataFrame/Series using the - ``frame_or_series`` fixture, by convention it goes in the - ``tests.frame`` file. - -7. Is your test for an Index method, not depending on Series/DataFrame? 
- This test likely belongs in one of: - - - tests.indexes - -8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, - ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, - ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? - This test likely belongs in one of: - - - tests.arrays - -9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")? - This test likely belongs in one of: - - - tests.extension diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index ee061e7b7d3e6..166162a4763bf 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -19,7 +19,7 @@ development to remain focused around it's original requirements. This is an inexhaustive list of projects that build on pandas in order to provide tools in the PyData space. For a list of projects that depend on pandas, see the -`libraries.io usage page for pandas `_ +`Github network dependents for pandas `_ or `search pypi for pandas `_. We'd like to make it easier for users to find these projects, if you know of other @@ -30,16 +30,18 @@ substantial projects that you feel should be on this list, please let us know. Data cleaning and validation ---------------------------- -`Pyjanitor `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Pyjanitor `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pyjanitor provides a clean API for cleaning data, using method chaining. -`Engarde `__ +`Pandera `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Engarde is a lightweight library used to explicitly state assumptions about your datasets -and check that they're *actually* true. +Pandera provides a flexible and expressive API for performing data validation on dataframes +to make data processing pipelines more readable and robust. +Dataframes contain information that pandera explicitly validates at runtime. This is useful in +production-critical data pipelines or reproducible research settings. `pandas-path `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,19 +71,19 @@ a long-standing special relationship with pandas. Statsmodels provides powerful econometrics, analysis and modeling functionality that is out of pandas' scope. Statsmodels leverages pandas objects as the underlying data container for computation. -`sklearn-pandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`sklearn-pandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use pandas DataFrames in your `scikit-learn `__ ML pipeline. `Featuretools `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. `Compose `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. 
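To make the data-validation idea mentioned above more concrete, here is a minimal sketch of the kind of runtime check Pandera performs. It assumes the ``pandera`` package is installed; the schema, column names and bounds are invented purely for illustration and are not taken from Pandera's documentation.

.. code-block:: python

   import pandas as pd
   import pandera as pa

   # Hypothetical schema: "total_bill" must be non-negative floats,
   # "size" must be small positive integers.
   schema = pa.DataFrameSchema(
       {
           "total_bill": pa.Column(float, pa.Check.ge(0)),
           "size": pa.Column(int, pa.Check.between(1, 10)),
       }
   )

   df = pd.DataFrame({"total_bill": [16.99, 10.34], "size": [2, 3]})
   schema.validate(df)  # raises a SchemaError if any check fails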
@@ -113,8 +115,8 @@ simplicity produces beautiful and effective visualizations with a minimal amount of code. Altair works with pandas DataFrames. -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Bokeh `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Bokeh is a Python interactive visualization library for large datasets that natively uses the latest web technologies. Its goal is to provide elegant, concise construction of novel @@ -145,7 +147,7 @@ estimation while plotting, aggregating across observations and visualizing the fit of statistical models to emphasize patterns in a dataset. `plotnine `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. Based on `"The Grammar of Graphics" `__ it @@ -159,10 +161,10 @@ A good implementation for Python users is `has2k1/plotnine `__ leverages `Vega `__ to create plots within Jupyter Notebook. -`Plotly `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Plotly `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline `__, or `on-premise `__ accounts for private use. `Lux `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -177,7 +179,7 @@ A good implementation for Python users is `has2k1/plotnine `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allow users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code. +By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allow users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code. 
`Qtpandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -202,8 +204,7 @@ invoked with the following command dtale.show(df) D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle -& Google Colab. Here are some demos of the `grid `__ -and `chart-builder `__. +& Google Colab. Here are some demos of the `grid `__. `hvplot `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -218,7 +219,7 @@ It can be loaded as a native pandas plotting backend via .. _ecosystem.ide: IDE ------- +--- `IPython `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -262,7 +263,7 @@ debugging and profiling functionality of a software development tool with the data exploration, interactive execution, deep inspection and rich visualization capabilities of a scientific environment like MATLAB or Rstudio. -Its `Variable Explorer `__ +Its `Variable Explorer `__ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. @@ -272,9 +273,9 @@ Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. Most pandas classes, methods and data attributes can be autocompleted in -Spyder's `Editor `__ and -`IPython Console `__, -and Spyder's `Help pane `__ can retrieve +Spyder's `Editor `__ and +`IPython Console `__, +and Spyder's `Help pane `__ can retrieve and render Numpydoc documentation on pandas objects in rich text with Sphinx both automatically and on-demand. @@ -310,8 +311,8 @@ The following data feeds are available: * Stooq Index Data * MOEX Data -`Quandl/Python `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Quandl/Python `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quandl API for Python wraps the Quandl REST API to return pandas DataFrames with timeseries indexes. @@ -322,8 +323,8 @@ PyDatastream is a Python interface to the REST API to return indexed pandas DataFrames with financial data. This package requires valid credentials for this API (non free). -`pandaSDMX `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`pandaSDMX `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pandaSDMX is a library to retrieve and acquire statistical data and metadata disseminated in `SDMX `_ 2.1, an ISO-standard @@ -355,13 +356,22 @@ with pandas. Domain specific --------------- -`Geopandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Geopandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Geopandas extends pandas data objects to include geographic information which support geometric operations. If your work entails maps and geographical coordinates, and you love pandas, you should take a close look at Geopandas. +`staircase `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +staircase is a data analysis package, built upon pandas and numpy, for modelling and +manipulation of mathematical step functions. It provides a rich variety of arithmetic +operations, relational operations, logical operations, statistical operations and +aggregations for step functions defined over real numbers, datetime and timedelta domains. + + `xarray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -396,7 +406,7 @@ any Delta table into Pandas dataframe. .. 
_ecosystem.out-of-core: Out-of-core -------------- +----------- `Blaze `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -434,8 +444,8 @@ can selectively scale parts of their pandas DataFrame applications. print(df3) -`Dask `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Dask `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Dask is a flexible parallel computing library for analytics. Dask provides a familiar ``DataFrame`` interface for out-of-core, parallel and distributed computing. @@ -445,6 +455,12 @@ provides a familiar ``DataFrame`` interface for out-of-core, parallel and distri Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. +`Ibis `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). + + `Koalas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -467,8 +483,8 @@ time-consuming tasks like ingesting data (``read_csv``, ``read_excel``, df = pd.read_csv("big.csv") # use all your cores! -`Odo `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Odo `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Odo provides a uniform API for moving data between different formats. It uses pandas own ``read_csv`` for CSV IO and leverages many existing packages such as @@ -492,8 +508,8 @@ If also displays progress bars. df.parallel_apply(func) -`Vaex `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Vaex `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a Python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). @@ -524,7 +540,7 @@ Pandas-Genomics provides extension types, extension arrays, and extension access `Pint-Pandas`_ ~~~~~~~~~~~~~~ -``Pint-Pandas `` provides an extension type for +`Pint-Pandas `_ provides an extension type for storing numeric arrays with units. These arrays can be stored inside pandas' Series and DataFrame. Operations between Series and DataFrame columns which use pint's extension array are then units aware. @@ -532,7 +548,7 @@ use pint's extension array are then units aware. `Text Extensions for Pandas`_ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``Text Extensions for Pandas `` +`Text Extensions for Pandas `_ provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into Pandas DataFrames. @@ -557,6 +573,7 @@ Library Accessor Classes Description `composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. `datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers. 
`woodwork`_ ``ww`` ``Series``, ``DataFrame`` Provides physical, logical, and semantic data typing information for Series and DataFrames. +`staircase`_ ``sc`` ``Series`` Provides methods for querying, aggregating and plotting step functions ================== ============ ==================================== =============================================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -567,5 +584,19 @@ Library Accessor Classes Description .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas .. _composeml: https://github.com/alteryx/compose -.. _datatest: https://datatest.readthedocs.io/ +.. _datatest: https://datatest.readthedocs.io/en/stable/ .. _woodwork: https://github.com/alteryx/woodwork +.. _staircase: https://www.staircase.dev/ + +Development tools +----------------- + +`pandas-stubs `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While pandas repository is partially typed, the package itself doesn't expose this information for external use. +Install pandas-stubs to enable basic type coverage of pandas API. + +Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`. + +See installation and usage instructions on the `github page `__. diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 864081002086b..f91f4218c3429 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -31,7 +31,7 @@ Quick reference We'll start off with a quick reference guide pairing some common R operations using `dplyr -`__ with +`__ with pandas equivalents. @@ -326,8 +326,8 @@ table below shows how these data structures could be mapped in Python. | data.frame | dataframe | +------------+-------------------------------+ -|ddply|_ -~~~~~~~~ +ddply +~~~~~ An expression using a data.frame called ``df`` in R where you want to summarize ``x`` by ``month``: @@ -372,8 +372,8 @@ For more details and examples see :ref:`the groupby documentation reshape / reshape2 ------------------ -|meltarray|_ -~~~~~~~~~~~~~ +meltarray +~~~~~~~~~ An expression using a 3 dimensional array called ``a`` in R where you want to melt it into a data.frame: @@ -390,8 +390,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension. a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) -|meltlist|_ -~~~~~~~~~~~~ +meltlist +~~~~~~~~ An expression using a list called ``a`` in R where you want to melt it into a data.frame: @@ -412,8 +412,8 @@ In Python, this list would be a list of tuples, so For more details and examples see :ref:`the Into to Data Structures documentation `. -|meltdf|_ -~~~~~~~~~~~~~~~~ +meltdf +~~~~~~ An expression using a data.frame called ``cheese`` in R where you want to reshape the data.frame: @@ -447,8 +447,8 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: For more details and examples see :ref:`the reshaping documentation `. -|cast|_ -~~~~~~~ +cast +~~~~ In R ``acast`` is an expression using a data.frame called ``df`` in R to cast into a higher dimensional array: @@ -577,20 +577,5 @@ For more details and examples see :ref:`categorical introduction ` .. |subset| replace:: ``subset`` .. _subset: https://stat.ethz.ch/R-manual/R-patched/library/base/html/subset.html -.. 
|ddply| replace:: ``ddply`` -.. _ddply: https://cran.r-project.org/web/packages/plyr/plyr.pdf#Rfn.ddply.1 - -.. |meltarray| replace:: ``melt.array`` -.. _meltarray: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.array.1 - -.. |meltlist| replace:: ``melt.list`` -.. meltlist: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.list.1 - -.. |meltdf| replace:: ``melt.data.frame`` -.. meltdf: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.data.frame.1 - -.. |cast| replace:: ``cast`` -.. cast: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.cast.1 - .. |factor| replace:: ``factor`` .. _factor: https://stat.ethz.ch/R-manual/R-devel/library/base/html/factor.html diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 54b45dc20db20..595f3c85a9dc2 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -96,7 +96,7 @@ Reading external data Like SAS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. SAS provides ``PROC IMPORT`` to read csv data into a data set. @@ -112,8 +112,8 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python url = ( - "https://raw.github.com/pandas-dev/" - "pandas/master/pandas/tests/io/data/csv/tips.csv" + "https://raw.githubusercontent.com/pandas-dev/" + "pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) tips @@ -335,7 +335,7 @@ Extracting substring by position ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SAS extracts a substring from a string based on its position with the -`SUBSTR `__ function. +`SUBSTR `__ function. .. code-block:: sas @@ -538,7 +538,7 @@ This means that the size of data able to be loaded in pandas is limited by your machine's memory, but also that the operations on that data may be faster. If out of core processing is needed, one possibility is the -`dask.dataframe `_ +`dask.dataframe `_ library (currently in development) which provides a subset of pandas functionality for an on-disk ``DataFrame`` diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index bdd0f7d8cfddf..d55b669d94a87 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -11,7 +11,7 @@ of how various spreadsheet operations would be performed using pandas. This page terminology and link to documentation for Excel, but much will be the same/similar in `Google Sheets `_, `LibreOffice Calc `_, -`Apple Numbers `_, and other +`Apple Numbers `_, and other Excel-compatible spreadsheet software. .. include:: includes/introduction.rst @@ -85,14 +85,14 @@ In a spreadsheet, `values can be typed directly into cells `__ +Both `Excel `__ and :ref:`pandas <10min_tut_02_read_write>` can import data from various sources in various formats. CSV ''' -Let's load and display the `tips `_ +Let's load and display the `tips `_ dataset from the pandas tests, which is a CSV file. In Excel, you would download and then `open the CSV `_. 
In pandas, you pass the URL or local path of the CSV file to :func:`~pandas.read_csv`: @@ -100,8 +100,8 @@ In pandas, you pass the URL or local path of the CSV file to :func:`~pandas.read .. ipython:: python url = ( - "https://raw.github.com/pandas-dev" - "/pandas/master/pandas/tests/io/data/csv/tips.csv" + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) tips @@ -435,13 +435,14 @@ The equivalent in pandas: Adding a row ~~~~~~~~~~~~ -Assuming we are using a :class:`~pandas.RangeIndex` (numbered ``0``, ``1``, etc.), we can use :meth:`DataFrame.append` to add a row to the bottom of a ``DataFrame``. +Assuming we are using a :class:`~pandas.RangeIndex` (numbered ``0``, ``1``, etc.), we can use :func:`concat` to add a row to the bottom of a ``DataFrame``. .. ipython:: python df - new_row = {"class": "E", "student_count": 51, "all_pass": True} - df.append(new_row, ignore_index=True) + new_row = pd.DataFrame([["E", 51, True]], + columns=["class", "student_count", "all_pass"]) + pd.concat([df, new_row], ignore_index=True) Find and Replace diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 49a21f87382b3..a6d9d65e85645 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -17,8 +17,8 @@ structure. .. ipython:: python url = ( - "https://raw.github.com/pandas-dev" - "/pandas/master/pandas/tests/io/data/csv/tips.csv" + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) tips @@ -233,6 +233,12 @@ default, :meth:`~pandas.DataFrame.join` will join the DataFrames on their indice parameters allowing you to specify the type of join to perform (``LEFT``, ``RIGHT``, ``INNER``, ``FULL``) or the columns to join on (column names or indices). +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + .. ipython:: python df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 94c45adcccc82..b4b0c42d1db1d 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -92,7 +92,7 @@ Reading external data Like Stata, pandas provides utilities for reading in data from many formats. The ``tips`` data set, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. Stata provides ``import delimited`` to read csv data into a data set in memory. @@ -108,8 +108,8 @@ the data set if presented with a url. .. ipython:: python url = ( - "https://raw.github.com/pandas-dev" - "/pandas/master/pandas/tests/io/data/csv/tips.csv" + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) tips @@ -496,6 +496,6 @@ Disk vs memory pandas and Stata both operate exclusively in memory. This means that the size of data able to be loaded in pandas is limited by your machine's memory. 
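To illustrate the join warning added above, a minimal sketch (the frames, column names and values are invented for the example; :func:`merge` is used here, but the same null-key matching applies to key columns generally)::

    import numpy as np
    import pandas as pd

    left = pd.DataFrame({"key": ["A", np.nan], "lval": [1, 2]})
    right = pd.DataFrame({"key": ["A", np.nan], "rval": [10, 20]})

    # Unlike a typical SQL inner join, the NaN keys match each other,
    # so the result contains a row for the null key as well.
    print(pd.merge(left, right, on="key"))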
If out of core processing is needed, one possibility is the -`dask.dataframe `_ +`dask.dataframe `_ library, which provides a subset of pandas functionality for an on-disk ``DataFrame``. diff --git a/doc/source/getting_started/comparison/includes/nth_word.rst b/doc/source/getting_started/comparison/includes/nth_word.rst index 7af0285005d5b..20e2ec47a8c9d 100644 --- a/doc/source/getting_started/comparison/includes/nth_word.rst +++ b/doc/source/getting_started/comparison/includes/nth_word.rst @@ -5,5 +5,5 @@ word by index. Note there are more powerful approaches should you need them. firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0] - firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[1] firstlast diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 88e54421daa11..31eaa2367b683 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -12,7 +12,7 @@ cross platform distribution for data analysis and scientific computing. This is the recommended installation method for most users. Instructions for installing from source, -`PyPI `__, `ActivePython `__, various Linux distributions, or a +`PyPI `__, `ActivePython `__, various Linux distributions, or a `development version `__ are also provided. .. _install.version: @@ -20,7 +20,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.7.1 and above, 3.8, and 3.9. +Officially Python 3.8, 3.9, 3.10 and 3.11. Installing pandas ----------------- @@ -47,7 +47,7 @@ rest of the `SciPy `__ stack without needing to install anything else, and without needing to wait for any software to be compiled. Installation instructions for `Anaconda `__ -`can be found here `__. +`can be found here `__. A full list of the packages available as part of the `Anaconda `__ distribution @@ -70,18 +70,18 @@ and involves downloading the installer which is a few hundred megabytes in size. If you want to have more control on which packages, or have a limited internet bandwidth, then installing pandas with -`Miniconda `__ may be a better solution. +`Miniconda `__ may be a better solution. -`Conda `__ is the package manager that the +`Conda `__ is the package manager that the `Anaconda `__ distribution is built upon. It is a package manager that is both cross-platform and language agnostic (it can play a similar role to a pip and virtualenv combination). `Miniconda `__ allows you to create a minimal self contained Python installation, and then use the -`Conda `__ command to install additional packages. +`Conda `__ command to install additional packages. -First you will need `Conda `__ to be installed and +First you will need `Conda `__ to be installed and downloading and running the `Miniconda `__ will do this for you. The installer @@ -132,6 +132,9 @@ Installing from PyPI pandas can be installed via pip from `PyPI `__. +.. note:: + You must have ``pip>=19.3`` to install from PyPI. + :: pip install pandas @@ -140,8 +143,8 @@ Installing with ActivePython ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Installation instructions for -`ActivePython `__ can be found -`here `__. Versions +`ActivePython `__ can be found +`here `__. Versions 2.7, 3.5 and 3.6 include pandas. Installing using your Linux distribution's package manager. 
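Whichever installation route described above is used, a quick way to confirm which pandas the active environment actually picked up (the version string shown will of course differ per install) is::

    import pandas as pd

    print(pd.__version__)  # the installed pandas version
    pd.show_versions()     # full report of pandas and its dependencies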
@@ -155,10 +158,10 @@ The commands in this table will install pandas for Python 3 from your distributi Debian, stable, `official Debian repository `__ , ``sudo apt-get install python3-pandas`` - Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` + Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python3-pandas`` OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python3-pandas`` - Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` Centos/RHEL, stable, `EPEL repository `__ , ``yum install python3-pandas`` **However**, the packages in the linux package managers are often a few versions behind, so @@ -196,22 +199,33 @@ the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest `__ >= 6.0 and `Hypothesis -`__ >= 3.58, then run: +`__ >= 6.13.0, then run: :: >>> pd.test() - running: pytest --skip-slow --skip-network C:\Users\TP\Anaconda3\envs\py36\lib\site-packages\pandas - ============================= test session starts ============================= - platform win32 -- Python 3.6.2, pytest-3.6.0, py-1.4.34, pluggy-0.4.0 - rootdir: C:\Users\TP\Documents\Python\pandasdev\pandas, inifile: setup.cfg - collected 12145 items / 3 skipped + running: pytest --skip-slow --skip-network --skip-db /home/user/anaconda3/lib/python3.9/site-packages/pandas + + ============================= test session starts ============================== + platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 + rootdir: /home/user + plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3 + collected 154975 items / 4 skipped / 154971 selected + ........................................................................ [ 0%] + ........................................................................ [ 99%] + ....................................... [100%] + + ==================================== ERRORS ==================================== + + =================================== FAILURES =================================== - ..................................................................S...... - ........S................................................................ - ......................................................................... + =============================== warnings summary =============================== - ==================== 12130 passed, 12 skipped in 368.339 seconds ===================== + =========================== short test summary info ============================ + + = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) = + +This is just an example of what information is shown. You might see a slightly different result than what is shown above. .. 
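As a sketch only, and assuming the installed pandas exposes the ``extra_args`` parameter of :func:`pandas.test` (it forwards the given flags to pytest in place of the default skip flags shown above), a much quicker sanity check than the full run is::

    import pandas as pd

    # Only collect the tests instead of running them; any standard pytest
    # flags can be passed the same way.
    pd.test(extra_args=["--collect-only", "-q"])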
_install.dependencies: @@ -221,9 +235,9 @@ Dependencies ================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`NumPy `__ 1.17.3 -`python-dateutil `__ 2.7.3 -`pytz `__ 2017.3 +`NumPy `__ 1.20.3 +`python-dateutil `__ 2.8.1 +`pytz `__ 2020.1 ================================================================ ========================== .. _install.recommended_dependencies: @@ -233,11 +247,11 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.7.0 or higher. + If installed, must be Version 2.7.3 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, - must be Version 1.2.1 or higher. + must be Version 1.3.2 or higher. .. note:: @@ -256,16 +270,32 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. +Timezones +^^^^^^^^^ + +========================= ========================= ============================================================= +Dependency Minimum Version Notes +========================= ========================= ============================================================= +tzdata 2022.1(pypi)/ Allows the use of ``zoneinfo`` timezones with pandas. + 2022a(for system tzdata) **Note**: You only need to install the pypi package if your + system does not already provide the IANA tz database. + However, the minimum tzdata version still applies, even if it + is not enforced through an error. + + If you would like to keep your system tzdata version updated, + it is recommended to use the ``tzdata`` package from + conda-forge. 
+========================= ========================= ============================================================= + Visualization ^^^^^^^^^^^^^ ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -setuptools 38.6.0 Utils for entry points of plotting backend -matplotlib 2.2.3 Plotting library -Jinja2 2.10 Conditional formatting with DataFrame.style -tabulate 0.8.7 Printing in Markdown-friendly format (see `tabulate`_) +matplotlib 3.3.2 Plotting library +Jinja2 3.0.0 Conditional formatting with DataFrame.style +tabulate 0.8.9 Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ============================================================= Computation @@ -274,10 +304,10 @@ Computation ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -SciPy 1.12.0 Miscellaneous statistical functions -numba 0.46.0 Alternative execution engine for rolling operations +SciPy 1.7.1 Miscellaneous statistical functions +numba 0.53.1 Alternative execution engine for rolling operations (see :ref:`Enhancing Performance `) -xarray 0.12.3 pandas-like API for N-dimensional data +xarray 0.19.0 pandas-like API for N-dimensional data ========================= ================== ============================================================= Excel files @@ -286,11 +316,11 @@ Excel files ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -xlrd 1.2.0 Reading Excel +xlrd 2.0.1 Reading Excel xlwt 1.3.0 Writing Excel -xlsxwriter 1.0.2 Writing Excel -openpyxl 3.0.0 Reading / writing for xlsx files -pyxlsb 1.0.6 Reading for xlsb files +xlsxwriter 1.4.3 Writing Excel +openpyxl 3.0.7 Reading / writing for xlsx files +pyxlsb 1.0.8 Reading for xlsb files ========================= ================== ============================================================= HTML @@ -299,9 +329,9 @@ HTML ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -BeautifulSoup4 4.6.0 HTML parser for read_html -html5lib 1.0.1 HTML parser for read_html -lxml 4.3.0 HTML parser for read_html +BeautifulSoup4 4.9.3 HTML parser for read_html +html5lib 1.1 HTML parser for read_html +lxml 4.6.3 HTML parser for read_html ========================= ================== ============================================================= One of the following combinations of libraries is needed to use the @@ -334,7 +364,7 @@ XML ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -lxml 4.3.0 XML parser for read_xml and tree builder for to_xml +lxml 4.5.0 XML parser for read_xml and tree builder for to_xml ========================= ================== 
============================================================= SQL databases @@ -343,9 +373,9 @@ SQL databases ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -SQLAlchemy 1.3.0 SQL support for databases other than sqlite -psycopg2 2.7 PostgreSQL engine for sqlalchemy -pymysql 0.8.1 MySQL engine for sqlalchemy +SQLAlchemy 1.4.16 SQL support for databases other than sqlite +psycopg2 2.8.6 PostgreSQL engine for sqlalchemy +pymysql 1.0.2 MySQL engine for sqlalchemy ========================= ================== ============================================================= Other data sources @@ -354,12 +384,12 @@ Other data sources ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -PyTables 3.5.1 HDF5-based reading / writing -blosc 1.17.0 Compression for HDF5 +PyTables 3.6.1 HDF5-based reading / writing +blosc 1.21.0 Compression for HDF5 zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing -pyarrow 0.17.0 Parquet, ORC, and feather reading / writing -pyreadstat SPSS files (.sav) reading +pyarrow 1.0.1 Parquet, ORC, and feather reading / writing +pyreadstat 1.1.2 SPSS files (.sav) reading ========================= ================== ============================================================= .. _install.warn_orc: @@ -383,10 +413,10 @@ Access data in the cloud ========================= ================== ============================================================= Dependency Minimum Version Notes ========================= ================== ============================================================= -fsspec 0.7.4 Handling files aside from simple local and HTTP -gcsfs 0.6.0 Google Cloud Storage access -pandas-gbq 0.12.0 Google Big Query access -s3fs 0.4.0 Amazon S3 access +fsspec 2021.7.0 Handling files aside from simple local and HTTP +gcsfs 2021.7.0 Google Cloud Storage access +pandas-gbq 0.15.0 Google Big Query access +s3fs 2021.08.0 Amazon S3 access ========================= ================== ============================================================= Clipboard @@ -400,3 +430,15 @@ qtpy Clipboard I/O xclip Clipboard I/O on linux xsel Clipboard I/O on linux ========================= ================== ============================================================= + + +Compression +^^^^^^^^^^^ + +========================= ================== ============================================================= +Dependency Minimum Version Notes +========================= ================== ============================================================= +brotli 0.7.0 Brotli compression +python-snappy 0.6.0 Snappy compression +Zstandard 0.15.2 Zstandard compression +========================= ================== ============================================================= diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 4106b0e064823..291cbddff58eb 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -242,7 +242,7 @@ I want to work with passenger data for which the age is known. 
age_no_na.head() The :meth:`~Series.notna` conditional function returns a ``True`` for each row the -values are not an ``Null`` value. As such, this can be combined with the +values are not a ``Null`` value. As such, this can be combined with the selection brackets ``[]`` to filter the data table. .. raw:: html @@ -358,9 +358,9 @@ See the user guide section on :ref:`different choices for indexing -How to create plots in pandas? ------------------------------- - -.. image:: ../../_static/schemas/04_plot_overview.svg - :align: center - .. raw:: html
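To round off the ``notna`` explanation above with a self-contained sketch (a small invented frame stands in for the tutorial's Titanic data)::

    import numpy as np
    import pandas as pd

    passengers = pd.DataFrame(
        {"name": ["Braund", "Heikkinen", "Allen"], "age": [22.0, np.nan, 35.0]}
    )

    # notna() yields a boolean mask; combined with the selection brackets []
    # it keeps only the rows where the age is known.
    age_no_na = passengers[passengers["age"].notna()]
    print(age_no_na)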