diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000..e704c37df3e45 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,95 @@ +version: 2.1 + +jobs: + test-arm: + machine: + image: ubuntu-2004:2022.04.1 + resource_class: arm.large + environment: + ENV_FILE: ci/deps/circle-38-arm64.yaml + PYTEST_WORKERS: auto + PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" + PYTEST_TARGET: "pandas" + PANDAS_CI: "1" + steps: + - checkout + - run: .circleci/setup_env.sh + - run: > + PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH + LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD + ci/run_tests.sh + build-aarch64: + parameters: + cibw-build: + type: string + machine: + image: ubuntu-2004:2022.04.1 + resource_class: arm.large + environment: + ENV_FILE: ci/deps/circle-38-arm64.yaml + TRIGGER_SOURCE: << pipeline.trigger_source >> + steps: + - checkout + - run: + name: Check if build is necessary + command: | + # Check if tag is defined or TRIGGER_SOURCE is scheduled + if [[ -n "$CIRCLE_TAG" ]]; then + echo 'export IS_PUSH="true"' >> "$BASH_ENV" + elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then + echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" + # Look for the build label/[wheel build] in commit + # grep takes a regex, so need to escape brackets + elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then + : # Do nothing + elif ! (curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then + circleci-agent step halt + fi + - run: + name: Build aarch64 wheels + command: | + pip3 install cibuildwheel==2.12.1 + cibuildwheel --output-dir wheelhouse + environment: + CIBW_BUILD: << parameters.cibw-build >> + + - run: + name: Install Anaconda Client & Upload Wheels + command: | + echo "Install Mambaforge" + MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" + echo "Downloading $MAMBA_URL" + wget -q $MAMBA_URL -O minimamba.sh + chmod +x minimamba.sh + + MAMBA_DIR="$HOME/miniconda3" + rm -rf $MAMBA_DIR + ./minimamba.sh -b -p $MAMBA_DIR + + export PATH=$MAMBA_DIR/bin:$PATH + + mamba install -y -c conda-forge anaconda-client + + source ci/upload_wheels.sh + set_upload_vars + upload_wheels + - store_artifacts: + path: wheelhouse/ + +workflows: + test: + # Don't run trigger this one when scheduled pipeline runs + when: + not: + equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] + jobs: + - test-arm + build-wheels: + jobs: + - build-aarch64: + filters: + tags: + only: /^v.*/ + matrix: + parameters: + cibw-build: ["cp38-manylinux_aarch64", "cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"] diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh new file mode 100755 index 0000000000000..9499e73006862 --- /dev/null +++ b/.circleci/setup_env.sh @@ -0,0 +1,64 @@ +#!/bin/bash -e + +echo "Install Mambaforge" +MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" +echo "Downloading $MAMBA_URL" +wget -q $MAMBA_URL -O minimamba.sh +chmod +x minimamba.sh + +MAMBA_DIR="$HOME/miniconda3" +rm -rf $MAMBA_DIR +./minimamba.sh -b -p $MAMBA_DIR + +export PATH=$MAMBA_DIR/bin:$PATH + +echo +echo "which conda" +which conda + +echo +echo "update conda" +conda config --set ssl_verify false +conda config --set quiet 
true --set always_yes true --set changeps1 false +mamba install -y -c conda-forge -n base pip setuptools + +echo "conda info -a" +conda info -a + +echo "conda list (root environment)" +conda list + +echo +# Clean up any left-over from a previous build +mamba env remove -n pandas-dev +echo "mamba env update --file=${ENV_FILE}" +# See https://github.com/mamba-org/mamba/issues/633 +mamba create -q -n pandas-dev +time mamba env update -n pandas-dev --file="${ENV_FILE}" + +echo "conda list -n pandas-dev" +conda list -n pandas-dev + +echo "activate pandas-dev" +source activate pandas-dev + +# Explicitly set an environment variable indicating that this is pandas' CI environment. +# +# This allows us to enable things like -Werror that shouldn't be activated in +# downstream CI jobs that may also build pandas from source. +export PANDAS_CI=1 + +if pip list | grep -q ^pandas; then + echo + echo "remove any installed pandas package w/o removing anything else" + pip uninstall -y pandas || true +fi + +echo "Build extensions" +# GH 47305: Parallel build can causes flaky ImportError from pandas/_libs/tslibs +python setup.py build_ext -q -j1 + +echo "Install pandas" +python -m pip install --no-build-isolation --no-use-pep517 -e . + +echo "done" diff --git a/.devcontainer.json b/.devcontainer.json index 8bea96aea29c1..7c5d009260c64 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -9,8 +9,7 @@ // You can edit these settings after create using File > Preferences > Settings > Remote. "settings": { "terminal.integrated.shell.linux": "/bin/bash", - "python.condaPath": "/opt/conda/bin/conda", - "python.pythonPath": "/opt/conda/bin/python", + "python.pythonPath": "/usr/local/bin/python", "python.formatting.provider": "black", "python.linting.enabled": true, "python.linting.flake8Enabled": true, diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md deleted file mode 100644 index 7dd2e04249492..0000000000000 --- a/.github/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,63 +0,0 @@ -# Contributor Code of Conduct - -As contributors and maintainers of this project, and in the interest of -fostering an open and welcoming community, we pledge to respect all people who -contribute through reporting issues, posting feature requests, updating -documentation, submitting pull requests or patches, and other activities. - -We are committed to making participation in this project a harassment-free -experience for everyone, regardless of level of experience, gender, gender -identity and expression, sexual orientation, disability, personal appearance, -body size, race, ethnicity, age, religion, or nationality. - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery -* Personal attacks -* Trolling or insulting/derogatory comments -* Public or private harassment -* Publishing other's private information, such as physical or electronic - addresses, without explicit permission -* Other unethical or unprofessional conduct - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -By adopting this Code of Conduct, project maintainers commit themselves to -fairly and consistently applying these principles to every aspect of managing -this project. 
Project maintainers who do not follow or enforce the Code of -Conduct may be permanently removed from the project team. - -This Code of Conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. - -A working group of community members is committed to promptly addressing any -reported issues. The working group is made up of pandas contributors and users. -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the working group by e-mail (pandas-coc@googlegroups.com). -Messages sent to this e-mail address will not be publicly visible but only to -the working group members. The working group currently includes - -- Safia Abdalla -- Tom Augspurger -- Joris Van den Bossche -- Camille Scott -- Nathaniel Smith - -All complaints will be reviewed and investigated and will result in a response -that is deemed necessary and appropriate to the circumstances. Maintainers are -obligated to maintain confidentiality with regard to the reporter of an -incident. - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 1.3.0, available at -[https://www.contributor-covenant.org/version/1/3/0/][version], -and the [Swift Code of Conduct][swift]. - -[homepage]: https://www.contributor-covenant.org -[version]: https://www.contributor-covenant.org/version/1/3/0/ -[swift]: https://swift.org/community/#code-of-conduct - diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md deleted file mode 100644 index 49200523df40f..0000000000000 --- a/.github/CONTRIBUTING.md +++ /dev/null @@ -1,23 +0,0 @@ -# Contributing to pandas - -Whether you are a novice or experienced software developer, all contributions and suggestions are welcome! - -Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas.pydata.org/docs/dev/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information. - -## Getting Started - -If you are looking to contribute to the *pandas* codebase, the best place to start is the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues). This is also a great place for filing bug reports and making suggestions for ways in which we can improve the code and documentation. - -If you have additional questions, feel free to ask them on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). Further information can also be found in the "[Where to start?](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#where-to-start)" section. - -## Filing Issues - -If you notice a bug in the code or documentation, or have suggestions for how we can improve either, feel free to create an issue on the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) using [GitHub's "issue" form](https://github.com/pandas-dev/pandas/issues/new). The form contains some questions that will help us best address your issue. For more information regarding how to file issues against *pandas*, please refer to the "[Bug reports and enhancement requests](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#bug-reports-and-enhancement-requests)" section. 
- -## Contributing to the Codebase - -The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas), so you will need to use [Git](https://git-scm.com/) to clone the project and make changes to the codebase. Once you have obtained a copy of the code, you should create a development environment that is separate from your existing Python environment so that you can make and test changes without compromising your own work environment. For more information, please refer to the "[Working with the code](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#working-with-the-code)" section. - -Before submitting your changes for review, make sure to check that your changes do not break any tests. You can find more information about our test suites in the "[Test-driven development/code writing](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#test-driven-development-code-writing)" section. We also have guidelines regarding coding style that will be enforced during testing, which can be found in the "[Code standards](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#code-standards)" section. - -Once your changes are ready to be submitted, make sure to push your changes to GitHub before creating a pull request. Details about how to do that can be found in the "[Contributing your changes to pandas](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst#contributing-your-changes-to-pandas)" section. We will review your changes, and you will most likely be asked to make additional changes before it is finally ready to merge. However, once it's ready, we will merge it, and you will have successfully contributed to the codebase! diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml deleted file mode 100644 index 27dfded808b95..0000000000000 --- a/.github/FUNDING.yml +++ /dev/null @@ -1,3 +0,0 @@ -custom: https://pandas.pydata.org/donate.html -github: [numfocus] -tidelift: pypi/pandas diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 765c1b8bff62e..0000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,39 +0,0 @@ ---- - -name: Bug Report -about: Create a bug report to help us improve pandas -title: "BUG:" -labels: "Bug, Needs Triage" - ---- - -- [ ] I have checked that this issue has not already been reported. - -- [ ] I have confirmed this bug exists on the latest version of pandas. - -- [ ] (optional) I have confirmed this bug exists on the master branch of pandas. - ---- - -**Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug. - -#### Code Sample, a copy-pastable example - -```python -# Your code here - -``` - -#### Problem description - -[this should explain **why** the current behaviour is a problem and why the expected output is a better solution] - -#### Expected Output - -#### Output of ``pd.show_versions()`` - -
- -[paste the output of ``pd.show_versions()`` here leaving a blank line after the details tag] - -
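Both the retired Markdown template above and the new YAML form introduced below ask reporters for a minimal, copy-pastable reproducible example together with the output of `pd.show_versions()`. A minimal sketch of what such a report body could look like (the DataFrame and the behaviour shown are purely illustrative, not a real bug):

```python
import pandas as pd

# Smallest self-contained frame that still demonstrates the behaviour being reported
df = pd.DataFrame({"a": [1, 2, None]})

# Show the observed result...
print(df["a"].sum())

# ...and attach the environment details the template asks for
pd.show_versions()
```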
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000000000..4e1bc8f61d04e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,70 @@ +name: Bug Report +description: Report incorrect behavior in the pandas library +title: "BUG: " +labels: [Bug, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Pandas version checks + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this bug exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this bug exists on the + [main branch](https://pandas.pydata.org/docs/dev/getting_started/install.html#installing-the-development-version-of-pandas) + of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to + provide a minimal, copy-pastable example. + placeholder: > + import pandas as pd + + df = pd.DataFrame(range(5)) + + ... + render: python + validations: + required: true + - type: textarea + id: problem + attributes: + label: Issue Description + description: > + Please provide a description of the issue shown in the reproducible example. + validations: + required: true + - type: textarea + id: expected-behavior + attributes: + label: Expected Behavior + description: > + Please describe or show a code example of the expected behavior. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.md b/.github/ISSUE_TEMPLATE/documentation_improvement.md deleted file mode 100644 index 32d5612767a8c..0000000000000 --- a/.github/ISSUE_TEMPLATE/documentation_improvement.md +++ /dev/null @@ -1,22 +0,0 @@ ---- - -name: Documentation Improvement -about: Report wrong or missing documentation -title: "DOC:" -labels: "Docs, Needs Triage" - ---- - -#### Location of the documentation - -[this should provide the location of the documentation, e.g. "pandas.read_csv" or the URL of the documentation, e.g. "https://dev.pandas.io/docs/reference/api/pandas.read_csv.html"] - -**Note**: You can check the latest versions of the docs on `master` [here](https://pandas.pydata.org/docs/dev/). - -#### Documentation problem - -[this should provide a description of what documentation you believe needs to be fixed/improved] - -#### Suggested fix for documentation - -[this should explain the suggested fix and **why** it's better than the existing documentation] diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.yaml b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml new file mode 100644 index 0000000000000..b89600f8598e7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml @@ -0,0 +1,41 @@ +name: Documentation Improvement +description: Report wrong or missing documentation +title: "DOC: " +labels: [Docs, Needs Triage] + +body: + - type: checkboxes + attributes: + label: Pandas version checks + options: + - label: > + I have checked that the issue still exists on the latest versions of the docs + on `main` [here](https://pandas.pydata.org/docs/dev/) + required: true + - type: textarea + id: location + attributes: + label: Location of the documentation + description: > + Please provide the location of the documentation, e.g. "pandas.read_csv" or the + URL of the documentation, e.g. + "https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html" + placeholder: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + validations: + required: true + - type: textarea + id: problem + attributes: + label: Documentation problem + description: > + Please provide a description of what documentation you believe needs to be fixed/improved + validations: + required: true + - type: textarea + id: suggested-fix + attributes: + label: Suggested fix for documentation + description: > + Please explain the suggested fix and **why** it's better than the existing documentation + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 0c30b941bc520..0000000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,33 +0,0 @@ ---- - -name: Feature Request -about: Suggest an idea for pandas -title: "ENH:" -labels: "Enhancement, Needs Triage" - ---- - -#### Is your feature request related to a problem? - -[this should provide a description of what the problem is, e.g. "I wish I could use pandas to do [...]"] - -#### Describe the solution you'd like - -[this should provide a description of the feature request, e.g. 
"`DataFrame.foo` should get a new parameter `bar` that [...]", try to write a docstring for the desired feature] - -#### API breaking implications - -[this should provide a description of how this feature will affect the API] - -#### Describe alternatives you've considered - -[this should provide a description of any alternative solutions or features you've considered] - -#### Additional context - -[add any other context, code examples, or references to existing implementations about the feature request here] - -```python -# Your code here, if applicable - -``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 0000000000000..6e6cd78ace11d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,72 @@ +name: Feature Request +description: Suggest an idea for pandas +title: "ENH: " +labels: [Enhancement, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Feature Type + description: Please check what type of feature request you would like to propose. + options: + - label: > + Adding new functionality to pandas + - label: > + Changing existing functionality in pandas + - label: > + Removing existing functionality in pandas + - type: textarea + id: description + attributes: + label: Problem Description + description: > + Please describe what problem the feature would solve, e.g. "I wish I could use pandas to ..." + placeholder: > + I wish I could use pandas to return a Series from a DataFrame when possible. + validations: + required: true + - type: textarea + id: feature + attributes: + label: Feature Description + description: > + Please describe how the new feature would be implemented, using psudocode if relevant. + placeholder: > + Add a new parameter to DataFrame, to_series, to return a Series if possible. + + def __init__(self, ..., to_series: bool=False): + """ + Parameters + ---------- + ... + + to_series : bool, default False + Return a Series if possible + """ + if to_series: + return Series(data) + validations: + required: true + - type: textarea + id: alternative + attributes: + label: Alternative Solutions + description: > + Please describe any alternative solution (existing functionality, 3rd party package, etc.) + that would satisfy the feature request. + placeholder: > + Write a custom function to return a Series when possible. + + def to_series(...) + result = pd.DataFrame(...) + ... + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional Context + description: > + Please provide any relevant GitHub issues, code examples or references that help describe and support + the feature request. diff --git a/.github/ISSUE_TEMPLATE/installation_issue.yaml b/.github/ISSUE_TEMPLATE/installation_issue.yaml new file mode 100644 index 0000000000000..a80269ff0f12d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/installation_issue.yaml @@ -0,0 +1,66 @@ +name: Installation Issue +description: Report issues installing the pandas library on your system +title: "BUILD: " +labels: [Build, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Installation check + options: + - label: > + I have read the [installation guide](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-pandas). 
+ required: true + - type: input + id: platform + attributes: + label: Platform + description: > + Please provide the output of ``import platform; print(platform.platform())`` + validations: + required: true + - type: dropdown + id: method + attributes: + label: Installation Method + description: > + Please provide how you tried to install pandas from a clean environment. + options: + - pip install + - conda install + - apt-get install + - Built from source + - Other + validations: + required: true + - type: input + id: pandas + attributes: + label: pandas Version + description: > + Please provide the version of pandas you are trying to install. + validations: + required: true + - type: input + id: python + attributes: + label: Python Version + description: > + Please provide the installed version of Python. + validations: + required: true + - type: textarea + id: logs + attributes: + label: Installation Logs + description: > + If possible, please copy and paste the installation logs when attempting to install pandas. + value: > +
+ + + Replace this line with the installation logs. + + +
diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yaml b/.github/ISSUE_TEMPLATE/performance_issue.yaml new file mode 100644 index 0000000000000..096e012f4ee0f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance_issue.yaml @@ -0,0 +1,53 @@ +name: Performance Issue +description: Report slow performance or memory issues when running pandas code +title: "PERF: " +labels: [Performance, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + label: Pandas version checks + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this issue exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this issue exists on the main branch of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please provide a minimal, copy-pastable example that quantifies + [slow runtime](https://docs.python.org/3/library/timeit.html) or + [memory](https://pypi.org/project/memory-profiler/) issues. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true + - type: textarea + id: prior-performance + attributes: + label: Prior Performance + description: > + If applicable, please provide the prior version of pandas and output + of the same reproducible example where the performance issue did not exist. diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md deleted file mode 100644 index 9b48918ff2f6d..0000000000000 --- a/.github/ISSUE_TEMPLATE/submit_question.md +++ /dev/null @@ -1,24 +0,0 @@ ---- - -name: Submit Question -about: Ask a general question about pandas -title: "QST:" -labels: "Usage Question, Needs Triage" - ---- - -- [ ] I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) on StackOverflow for similar questions. - -- [ ] I have asked my usage related question on [StackOverflow](https://stackoverflow.com). - ---- - -#### Question about pandas - -**Note**: If you'd still like to submit a question, please read [this guide]( -https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. - -```python -# Your code here, if applicable - -``` diff --git a/.github/ISSUE_TEMPLATE/submit_question.yml b/.github/ISSUE_TEMPLATE/submit_question.yml new file mode 100644 index 0000000000000..6f73041b0f527 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/submit_question.yml @@ -0,0 +1,44 @@ +name: Submit Question +description: Ask a general question about pandas +title: "QST: " +labels: [Usage Question, Needs Triage] + +body: + - type: markdown + attributes: + value: > + Since [StackOverflow](https://stackoverflow.com) is better suited towards answering + usage questions, we ask that all usage questions are first asked on StackOverflow. + - type: checkboxes + attributes: + label: Research + options: + - label: > + I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) + on StackOverflow for similar questions. + required: true + - label: > + I have asked my usage related question on [StackOverflow](https://stackoverflow.com). + required: true + - type: input + id: question-link + attributes: + label: Link to question on StackOverflow + validations: + required: true + - type: markdown + attributes: + value: --- + - type: textarea + id: question + attributes: + label: Question about pandas + description: > + **Note**: If you'd still like to submit a question, please read [this guide]( + https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing + how to provide the necessary information for us to reproduce your question. + placeholder: | + ```python + # Your code here, if applicable + + ``` diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7c3870470f074..8eca91c692710 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,5 +1,5 @@ -- [ ] closes #xxxx -- [ ] tests added / passed -- [ ] passes `black pandas` -- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` -- [ ] whatsnew entry +- [ ] closes #xxxx (Replace xxxx with the GitHub issue number) +- [ ] [Tests added and passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#writing-tests) if fixing a bug or adding a new feature +- [ ] All [code checks passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#pre-commit). 
+- [ ] Added [type annotations](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#type-hints) to new arguments/methods/functions. +- [ ] Added an entry in the latest `doc/source/whatsnew/vX.X.X.rst` file if fixing a bug or adding a new feature. diff --git a/.github/SECURITY.md b/.github/SECURITY.md deleted file mode 100644 index f3b059a5d4f13..0000000000000 --- a/.github/SECURITY.md +++ /dev/null @@ -1 +0,0 @@ -To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there. diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml new file mode 100644 index 0000000000000..23bb988ef4d73 --- /dev/null +++ b/.github/actions/build_pandas/action.yml @@ -0,0 +1,22 @@ +name: Build pandas +description: Rebuilds the C extensions and installs pandas +runs: + using: composite + steps: + + - name: Environment Detail + run: | + micromamba info + micromamba list + shell: bash -el {0} + + - name: Build Pandas + run: | + python setup.py build_ext -j $N_JOBS + python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + shell: bash -el {0} + env: + # Cannot use parallel compilation on Windows, see https://github.com/pandas-dev/pandas/issues/30873 + # GH 47305: Parallel build causes flaky ImportError: /home/runner/work/pandas/pandas/pandas/_libs/tslibs/timestamps.cpython-38-x86_64-linux-gnu.so: undefined symbol: pandas_datetime_to_datetimestruct + N_JOBS: 1 + #N_JOBS: ${{ runner.os == 'Windows' && 1 || 2 }} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml new file mode 100644 index 0000000000000..fd7c3587f2254 --- /dev/null +++ b/.github/actions/run-tests/action.yml @@ -0,0 +1,27 @@ +name: Run tests and report results +runs: + using: composite + steps: + - name: Test + run: ci/run_tests.sh + shell: bash -el {0} + + - name: Publish test results + uses: actions/upload-artifact@v3 + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Report Coverage + run: coverage report -m + shell: bash -el {0} + if: failure() + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + flags: unittests + name: codecov-pandas + fail_ci_if_error: false + if: failure() diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml new file mode 100644 index 0000000000000..8aa417c1d8fd4 --- /dev/null +++ b/.github/actions/setup-conda/action.yml @@ -0,0 +1,25 @@ +name: Set up Conda environment +inputs: + environment-file: + description: Conda environment file to use. 
+ default: environment.yml + environment-name: + description: Name to use for the Conda environment + default: test + extra-specs: + description: Extra packages to install + required: false +runs: + using: composite + steps: + - name: Install ${{ inputs.environment-file }} + uses: mamba-org/provision-with-micromamba@v15 + with: + environment-file: ${{ inputs.environment-file }} + environment-name: ${{ inputs.environment-name }} + extra-specs: ${{ inputs.extra-specs }} + channels: conda-forge + channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }} + condarc-file: ci/condarc.yml + cache-env: true + cache-downloads: true diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml index a6d3f1f383751..b3331060823a9 100644 --- a/.github/workflows/assign.yml +++ b/.github/workflows/assign.yml @@ -3,12 +3,17 @@ on: issue_comment: types: created +permissions: + contents: read + jobs: - one: - runs-on: ubuntu-latest + issue_assign: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-22.04 steps: - if: github.event.comment.body == 'take' - name: run: | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees diff --git a/.github/workflows/autoupdate-pre-commit-config.yml b/.github/workflows/autoupdate-pre-commit-config.yml new file mode 100644 index 0000000000000..376aa8343c571 --- /dev/null +++ b/.github/workflows/autoupdate-pre-commit-config.yml @@ -0,0 +1,39 @@ +name: "Update pre-commit config" + +on: + schedule: + - cron: "0 7 1 * *" # At 07:00 on 1st of every month. 
+ workflow_dispatch: + +permissions: + contents: read + +jobs: + update-pre-commit: + permissions: + contents: write # for technote-space/create-pr-action to push code + pull-requests: write # for technote-space/create-pr-action to create a PR + if: github.repository_owner == 'pandas-dev' + name: Autoupdate pre-commit config + runs-on: ubuntu-22.04 + steps: + - name: Set up Python + uses: actions/setup-python@v4 + - name: Cache multiple paths + uses: actions/cache@v3 + with: + path: | + ~/.cache/pre-commit + ~/.cache/pip + key: pre-commit-autoupdate-${{ runner.os }}-build + - name: Update pre-commit config packages + uses: technote-space/create-pr-action@v2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + EXECUTE_COMMANDS: | + pip install pre-commit + pre-commit autoupdate || (exit 0); + pre-commit run -a || (exit 0); + COMMIT_MESSAGE: "⬆️ UPGRADE: Autoupdate pre-commit config" + PR_BRANCH_NAME: "pre-commit-config-update-${PR_ID}" + PR_TITLE: "⬆️ UPGRADE: Autoupdate pre-commit config" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index db1fc30111a2d..0000000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,142 +0,0 @@ -name: CI - -on: - push: - branches: master - pull_request: - branches: master - -env: - ENV_FILE: environment.yml - -jobs: - checks: - name: Checks - runs-on: ubuntu-latest - steps: - - - name: Setting conda path - run: echo "::add-path::${HOME}/miniconda3/bin" - - - name: Checkout - uses: actions/checkout@v1 - - - name: Looking for unwanted patterns - run: ci/code_checks.sh patterns - if: always() - - - name: Setup environment and build pandas - run: ci/setup_env.sh - if: always() - - - name: Linting - run: | - source activate pandas-dev - ci/code_checks.sh lint - if: always() - - - name: Dependencies consistency - run: | - source activate pandas-dev - ci/code_checks.sh dependencies - if: always() - - - name: Checks on imported code - run: | - source activate pandas-dev - ci/code_checks.sh code - if: always() - - - name: Running doctests - run: | - source activate pandas-dev - ci/code_checks.sh doctests - if: always() - - - name: Docstring validation - run: | - source activate pandas-dev - ci/code_checks.sh docstrings - if: always() - - - name: Typing validation - run: | - source activate pandas-dev - ci/code_checks.sh typing - if: always() - - - name: Testing docstring validation script - run: | - source activate pandas-dev - pytest --capture=no --strict scripts - if: always() - - - name: Running benchmarks - run: | - source activate pandas-dev - cd asv_bench - asv check -E existing - git remote add upstream https://github.com/pandas-dev/pandas.git - git fetch upstream - if git diff upstream/master --name-only | grep -q "^asv_bench/"; then - asv machine --yes - asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log - if grep "failed" benchmarks.log > /dev/null ; then - exit 1 - fi - else - echo "Benchmarks did not run, no changes detected" - fi - if: always() - - - name: Publish benchmarks artifact - uses: actions/upload-artifact@master - with: - name: Benchmarks log - path: asv_bench/benchmarks.log - if: failure() - - web_and_docs: - name: Web and docs - runs-on: ubuntu-latest - steps: - - - name: Setting conda path - run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" - - - name: Checkout - uses: actions/checkout@v1 - - - name: Setup environment and build pandas - run: ci/setup_env.sh - - - name: Build website - run: | - source activate pandas-dev - python web/pandas_web.py web/pandas 
--target-path=web/build - - - name: Build documentation - run: | - source activate pandas-dev - doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} - - # This can be removed when the ipython directive fails when there are errors, - # including the `tee sphinx.log` in te previous step (https://github.com/ipython/ipython/issues/11547) - - name: Check ipython directive errors - run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" - - - name: Install ssh key - run: | - mkdir -m 700 -p ~/.ssh - echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: github.event_name == 'push' - - - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas - if: github.event_name == 'push' - - - name: Upload dev docs - run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev - if: github.event_name == 'push' diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 0000000000000..9756843350036 --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,190 @@ +name: Code Checks + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + +env: + ENV_FILE: environment.yml + PANDAS_CI: 1 + +permissions: + contents: read + +jobs: + pre_commit: + name: pre-commit + runs-on: ubuntu-22.04 + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-pre-commit + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Run pre-commit + uses: pre-commit/action@v2.0.3 + with: + extra_args: --verbose --all-files + + docstring_typing_manual_hooks: + name: Docstring validation, typing, and other manual pre-commit hooks + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-code-checks + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + # The following checks are independent of each other and should still be run if one fails + - name: Check for no warnings when building single-page docs + run: ci/code_checks.sh single-docs + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run checks on imported code + run: ci/code_checks.sh code + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run doctests + run: ci/code_checks.sh doctests + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run docstring validation + run: ci/code_checks.sh docstrings + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run check of documentation notebooks + run: ci/code_checks.sh notebooks + if: ${{ 
steps.build.outcome == 'success' && always() }} + + - name: Use existing environment for type checking + run: | + echo $PATH >> $GITHUB_PATH + echo "PYTHONHOME=$PYTHONHOME" >> $GITHUB_ENV + echo "PYTHONPATH=$PYTHONPATH" >> $GITHUB_ENV + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Typing + pylint + uses: pre-commit/action@v2.0.3 + with: + extra_args: --verbose --hook-stage manual --all-files + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Run docstring validation script tests + run: pytest scripts + if: ${{ steps.build.outcome == 'success' && always() }} + + asv-benchmarks: + name: ASV Benchmarks + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-asv-benchmarks + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Run ASV benchmarks + run: | + cd asv_bench + asv machine --yes + asv run --quick --dry-run --strict --durations=30 --python=same + + build_docker_dev_environment: + name: Build Docker Dev Environment + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-build_docker_dev_environment + cancel-in-progress: true + + steps: + - name: Clean up dangling images + run: docker image prune -f + + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Build image + run: docker build --pull --no-cache --tag pandas-dev-env . 
+ + - name: Show environment + run: docker run --rm pandas-dev-env python -c "import pandas as pd; print(pd.show_versions())" + + requirements-dev-text-installable: + name: Test install requirements-dev.txt + runs-on: ubuntu-22.04 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-requirements-dev-text-installable + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + id: setup_python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + cache: 'pip' + cache-dependency-path: 'requirements-dev.txt' + + - name: Install requirements-dev.txt + run: pip install -r requirements-dev.txt + + - name: Check Pip Cache Hit + run: echo ${{ steps.setup_python.outputs.cache-hit }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000000..23609f692df7c --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,34 @@ +name: CodeQL +on: + schedule: + # every day at midnight + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + analyze: + runs-on: ubuntu-22.04 + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: + - python + + steps: + - uses: actions/checkout@v3 + - uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - uses: github/codeql-action/autobuild@v2 + - uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml new file mode 100644 index 0000000000000..2550d4de34a45 --- /dev/null +++ b/.github/workflows/comment-commands.yml @@ -0,0 +1,93 @@ +name: Comment Commands +on: + issue_comment: + types: created + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + issue_assign: + runs-on: ubuntu-22.04 + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + group: ${{ github.actor }}-issue-assign + steps: + - run: | + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + preview_docs: + runs-on: ubuntu-22.04 + if: github.event.issue.pull_request && github.event.comment.body == '/preview' + concurrency: + group: ${{ github.actor }}-preview-docs + steps: + - run: | + if curl --output /dev/null --silent --head --fail "https://pandas.pydata.org/preview/$%7B%7B%20github.event.issue.number%20%7D%7D/"; then + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "Website preview of this PR available at: https://pandas.pydata.org/preview/${{ github.event.issue.number }}/"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments + else + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "No preview found for PR #${{ github.event.issue.number }}. 
Did the docs build complete?"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments + fi + asv_run: + runs-on: ubuntu-22.04 + # TODO: Support more benchmarking options later, against different branches, against self, etc + if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark') + defaults: + run: + shell: bash -el {0} + env: + ENV_FILE: environment.yml + COMMENT: ${{github.event.comment.body}} + + concurrency: + # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) + # each user can only run one concurrent benchmark bot at a time + # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, you're gonna have + # to wait + group: ${{ github.actor }}-asv + cancel-in-progress: false + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + # Although asv sets up its own env, deps are still needed + # during discovery process + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Run benchmarks + id: bench + continue-on-error: true # asv will exit code 1 for regressions + run: | + # extracting the regex, see https://stackoverflow.com/a/36798723 + REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p") + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + asv machine --yes + asv continuous -f 1.1 -b $REGEX upstream/main HEAD + echo 'BENCH_OUTPUT<> $GITHUB_ENV + asv compare -f 1.1 upstream/main HEAD >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + echo "REGEX=$REGEX" >> $GITHUB_ENV + + - uses: actions/github-script@v6 + env: + BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} + REGEX: ${{env.REGEX}} + with: + script: | + const ENV_VARS = process.env + const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '\nBenchmarks completed. View runner logs here.' 
+ run_url + '\nRegex used: '+ 'regex ' + ENV_VARS["REGEX"] + '\n' + ENV_VARS["BENCH_OUTPUT"] + }) diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml new file mode 100644 index 0000000000000..d97e135da3703 --- /dev/null +++ b/.github/workflows/docbuild-and-upload.yml @@ -0,0 +1,89 @@ +name: Doc Build and Upload + +on: + push: + branches: + - main + - 2.0.x + tags: + - '*' + pull_request: + branches: + - main + - 2.0.x + +env: + ENV_FILE: environment.yml + PANDAS_CI: 1 + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +permissions: + contents: read + +jobs: + web_and_docs: + name: Doc Build and Upload + runs-on: ubuntu-22.04 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-web-docs + cancel-in-progress: true + + defaults: + run: + shell: bash -el {0} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Build website + run: python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: doc/make.py --warnings-are-errors + + - name: Build documentation zip + run: doc/make.py zip_html + + - name: Install ssh key + run: | + mkdir -m 700 -p ~/.ssh + echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFjYkJBk7sos+r7yATODogQc3jUdW1aascGpyOD4bohj8dWjzwLJv/OJ/fyOQ5lmj81WKDk67tGtqNJYGL9acII=" > ~/.ssh/known_hosts + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) + + - name: Copy cheatsheets into site directory + run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ + + - name: Upload web + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + - name: Upload dev docs + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/dev + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + - name: Upload prod docs + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/version/${GITHUB_REF_NAME:1} + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + + - name: Move docs into site directory + run: mv doc/build/html web/build/docs + + - name: Save website as an artifact + uses: actions/upload-artifact@v3 + with: + name: website + path: web/build + retention-days: 14 diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml new file mode 100644 index 0000000000000..6f1fa771a7854 --- /dev/null +++ b/.github/workflows/package-checks.yml @@ -0,0 +1,86 @@ +name: Package Checks + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + types: [ labeled, opened, synchronize, reopened ] + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + pip: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", 
"feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "all"] + fail-fast: false + name: Install Extras - ${{ matrix.extra }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-pip-extras-${{ matrix.extra }} + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + id: setup_python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Install required dependencies + run: | + python -m pip install --upgrade pip setuptools wheel python-dateutil pytz numpy cython + python -m pip install versioneer[toml] + + - name: Pip install with extra + run: python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation + conda_forge_recipe: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.9', '3.10', '3.11'] + fail-fast: false + name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-conda-forge-recipe-${{ matrix.python-version }} + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: mamba-org/provision-with-micromamba@v15 + with: + environment-file: false + environment-name: recipe-test + extra-specs: | + python=${{ matrix.python-version }} + boa + conda-verify + channels: conda-forge + cache-downloads: true + cache-env: true + + - name: Build conda package + run: conda mambabuild ci --no-anaconda-upload --verify --strict-verify --output --output-folder . diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml new file mode 100644 index 0000000000000..c47745e097d17 --- /dev/null +++ b/.github/workflows/stale-pr.yml @@ -0,0 +1,26 @@ +name: "Stale PRs" +on: + schedule: + # * is a special character in YAML so you have to quote this string + - cron: "0 0 * * *" + +permissions: + contents: read + +jobs: + stale: + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/stale@v4 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." 
+ stale-pr-label: "Stale" + exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" + days-before-issue-stale: -1 + days-before-pr-stale: 30 + days-before-close: -1 + remove-stale-when-updated: false + debug-only: false diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000000..31e2095624347 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,315 @@ +name: Unit Tests + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + ubuntu: + runs-on: ubuntu-22.04 + timeout-minutes: 180 + strategy: + matrix: + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + # Prevent the include jobs from overriding other jobs + pattern: [""] + include: + - name: "Downstream Compat" + env_file: actions-38-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + - name: "Minimum Versions" + env_file: actions-38-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + - name: "Locale: it_IT" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "it_IT" + - name: "Locale: zh_CN" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. 
+ # It will be temporarily activated during tests with locale.setlocale + extra_loc: "zh_CN" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Pypy" + env_file: actions-pypy-38.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + - name: "Numpy Dev" + env_file: actions-310-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # TODO(cython3): Re-enable once next-beta(after beta 1) comes out + # There are some warnings failing the build with -werror + pandas_ci: "0" + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and not network and not single_cpu" + fail-fast: false + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + PATTERN: ${{ matrix.pattern }} + EXTRA_APT: ${{ matrix.extra_apt || '' }} + LANG: ${{ matrix.lang || '' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} + PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: 'auto' + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + cancel-in-progress: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto:4.1.4 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Extra installs + # xsel for clipboard tests + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: | + sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Test (not single_cpu) + uses: ./.github/actions/run-tests + if: ${{ matrix.name != 'Pypy' }} + env: + # Set pattern to not single_cpu if not already set + PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} + + - name: Test (single_cpu) + uses: ./.github/actions/run-tests + env: + PATTERN: 'single_cpu' + PYTEST_WORKERS: 1 + if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} + + macos-windows: + timeout-minutes: 180 + strategy: + matrix: + os: [macos-latest, windows-latest] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} 
{1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + + Linux-32-bit: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/manylinux2014_i686 + options: --platform linux/386 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Build environment and Run Tests + run: | + /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir --no-deps -U pip wheel setuptools + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python setup.py build_ext -q -j$(nproc) + python -m pip install --no-cache-dir --no-build-isolation --no-use-pep517 -e . + python -m pip list + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit + cancel-in-progress: true + + python-dev: + # This job may or may not run depending on the state of the next + # unreleased Python version. DO NOT DELETE IT. + # + # In general, this will remain frozen(present, but not running) until: + # - The next unreleased Python version has released beta 1 + # - This version should be available on GitHub Actions. + # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) + # support that unreleased Python version. + # To unfreeze, comment out the ``if: false`` condition, and make sure you update + # the name of the workflow and Python version in actions/setup-python ``python-version:`` + # + # After it has been unfrozen, this file should remain unfrozen(present, and running) until: + # - The next Python version has been officially released. + # OR + # - Most/All of our optional dependencies support the next Python version AND + # - The next Python version has released a rc(we are guaranteed a stable ABI). + # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs + # to the corresponding posix/windows-macos/sdist etc. 
workflows. + # Feel free to modify this comment as necessary. + if: false # Uncomment this to freeze the workflow, comment it to unfreeze + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, macOS-latest, windows-latest] + + timeout-minutes: 180 + + concurrency: + #https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + COVERAGE: true + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v4 + with: + python-version: '3.11-dev' + + - name: Install dependencies + run: | + python --version + python -m pip install --upgrade pip setuptools wheel + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy + python -m pip install git+https://github.com/nedbat/coveragepy.git + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip list + + - name: Build Pandas + run: | + python setup.py build_ext -q -j4 + python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + + - name: Build Version + run: | + python -c "import pandas; pandas.show_versions();" + + - name: Test + uses: ./.github/actions/run-tests diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000000000..c9e44fae43669 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,207 @@ +# Workflow to build wheels for upload to PyPI. +# Inspired by numpy's cibuildwheel config https://github.com/numpy/numpy/blob/main/.github/workflows/wheels.yml +# +# In an attempt to save CI resources, wheel builds do +# not run on each push but only weekly and for releases. +# Wheel builds can be triggered from the Actions page +# (if you have the perms) on a commit to master. +# +# Alternatively, you can add labels to the pull request in order to trigger wheel +# builds. +# The label(s) that trigger builds are: +# - Build +name: Wheel builder + +on: + schedule: + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # │ │ │ │ │ + - cron: "27 3 */1 * *" + push: + pull_request: + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + - "web/**" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + build_wheels: + name: Build wheel for ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'Build')) || + (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! 
endsWith(github.ref, 'dev0'))) + runs-on: ${{ matrix.buildplat[0] }} + strategy: + # Ensure that a wheel builder finishes even if another fails + fail-fast: false + matrix: + # GitHub Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + buildplat: + - [ubuntu-20.04, manylinux_x86_64] + - [macos-11, macosx_*] + - [windows-2019, win_amd64] + - [windows-2019, win32] + # TODO: support PyPy? + python: [["cp38", "3.8"], ["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]# "pp38", "pp39"] + env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + steps: + - name: Checkout pandas + uses: actions/checkout@v3 + with: + submodules: true + # versioneer.py requires the latest tag to be reachable. Here we + # fetch the complete history to get access to the tags. + # A shallow clone can work when the following issue is resolved: + # https://github.com/actions/checkout/issues/338 + fetch-depth: 0 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.12.3 + env: + CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + + # Used to test(Windows-only) and push the built wheels + # You might need to use setup-python separately + # if the new Python-dev version + # is unavailable on conda-forge. + - uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: ${{ matrix.python[1] }} + activate-environment: test + channels: conda-forge, anaconda + channel-priority: true + # mamba fails to solve, also we really don't need this since we're just installing python + # mamba-version: "*" + + - name: Test wheels (Windows 64-bit only) + if: ${{ matrix.buildplat[1] == 'win_amd64' }} + shell: cmd /C CALL {0} + run: | + python ci/test_wheels.py wheelhouse + + - uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + path: ./wheelhouse/*.whl + + + - name: Install anaconda client + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + run: conda install -q -y anaconda-client + + + - name: Upload wheels + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + env: + PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }} + PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} + run: | + source ci/upload_wheels.sh + set_upload_vars + # trigger an upload to + # https://anaconda.org/scipy-wheels-nightly/pandas + # for cron jobs or "Run workflow" (restricted to main branch). + # Tags will upload to + # https://anaconda.org/multibuild-wheels-staging/pandas + # The tokens were originally generated at anaconda.org + upload_wheels + build_sdist: + name: Build sdist + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'Build')) || + (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! 
endsWith(github.ref, 'dev0'))) + runs-on: ubuntu-22.04 + env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + steps: + - name: Checkout pandas + uses: actions/checkout@v3 + with: + submodules: true + # versioneer.py requires the latest tag to be reachable. Here we + # fetch the complete history to get access to the tags. + # A shallow clone can work when the following issue is resolved: + # https://github.com/actions/checkout/issues/338 + fetch-depth: 0 + + # Used to push the built sdist + - uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + # Really doesn't matter what version we upload with + # just the version we test with + python-version: '3.8' + channels: conda-forge + channel-priority: true + # mamba fails to solve, also we really don't need this since we're just installing python + # mamba-version: "*" + + - name: Build sdist + run: | + pip install build + python -m build --sdist + - name: Test the sdist + shell: bash -el {0} + run: | + # TODO: Don't run test suite, and instead build wheels from sdist + # by splitting the wheel builders into a two stage job + # (1. Generate sdist 2. Build wheels from sdist) + # This tests the sdists, and saves some build time + python -m pip install dist/*.gz + pip install hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 + cd .. # Not a good idea to test within the src tree + python -c "import pandas; print(pandas.__version__); + pandas.test(extra_args=['-m not clipboard and not single_cpu', '--skip-slow', '--skip-network', '--skip-db', '-n=2']); + pandas.test(extra_args=['-m not clipboard and single_cpu', '--skip-slow', '--skip-network', '--skip-db'])" + - uses: actions/upload-artifact@v3 + with: + name: sdist + path: ./dist/* + + - name: Install anaconda client + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + run: | + conda install -q -y anaconda-client + + - name: Upload sdist + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + env: + PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }} + PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} + run: | + source ci/upload_wheels.sh + set_upload_vars + # trigger an upload to + # https://anaconda.org/scipy-wheels-nightly/pandas + # for cron jobs or "Run workflow" (restricted to main branch). 
+ # Tags will upload to + # https://anaconda.org/multibuild-wheels-staging/pandas + # The tokens were originally generated at anaconda.org + upload_wheels diff --git a/.gitignore b/.gitignore index 6c3c275c48fb7..07b1f056d511b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ *.log *.swp *.pdb +*.zip .project .pydevproject .settings @@ -49,6 +50,8 @@ dist *.egg-info .eggs .pypirc +# type checkers +pandas/py.typed # tox testing tool .tox @@ -103,13 +106,14 @@ asv_bench/env/ asv_bench/html/ asv_bench/results/ asv_bench/pandas/ +test-data.xml # Documentation generated files # ################################# doc/source/generated doc/source/user_guide/styled.xlsx doc/source/reference/api -doc/source/_static +doc/source/_static/*.html doc/source/vbench doc/source/vbench.rst doc/source/index.rst @@ -118,3 +122,7 @@ doc/build/html/index.html doc/tmp.sv env/ doc/source/savefig/ + +# Interactive terminal generated files # +######################################## +.jupyterlite.doit.db diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000000000..877c16eefb5d6 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,59 @@ +# Building pandas on init +# Might delegate this later to prebuild with Q2 improvements on gitpod +# https://www.gitpod.io/docs/config-start-tasks/#configuring-the-terminal +# ------------------------------------------------------------------------- + +# assuming we use dockerhub: name of the docker user, docker image, tag, e.g. https://hub.docker.com/r/pandas/pandas-gitpod/tags +image: pythonpandas/pandas-gitpod:latest +tasks: + - name: Prepare development environment + init: | + mkdir -p .vscode + cp gitpod/settings.json .vscode/settings.json + conda activate pandas-dev + git pull --unshallow # need to force this else the prebuild fails + git fetch --tags + python setup.py build_ext -j 4 + python -m pip install -e . --no-build-isolation + echo "🛠 Completed rebuilding Pandas!! 🛠 " + echo "✨ Pre-build complete! You can close this terminal ✨ " + +# -------------------------------------------------------- +# exposing ports for liveserve +ports: + - port: 5500 + onOpen: notify + +# -------------------------------------------------------- +# some useful extensions to have +vscode: + extensions: + - ms-python.python + - yzhang.markdown-all-in-one + - eamodio.gitlens + - lextudio.restructuredtext + - ritwickdey.liveserver + # add or remove what you think is generally useful to most contributors + # avoid adding too many. 
they each open a pop-up window + +# -------------------------------------------------------- +# using prebuilds for the container +# With this configuration the prebuild will happen on push to main +github: + prebuilds: + # enable for main/default branch + main: true + # enable for other branches (defaults to false) + branches: false + # enable for pull requests coming from this repo (defaults to true) + pullRequests: false + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: false + # add a check to pull requests (defaults to true) + addCheck: false + # add a "Review in Gitpod" button as a comment to pull requests (defaults to false) + addComment: false + # add a "Review in Gitpod" button to the pull request's description (defaults to false) + addBadge: false + # add a label once the prebuild is ready to pull requests (defaults to false) + addLabel: false diff --git a/.libcst.codemod.yaml b/.libcst.codemod.yaml new file mode 100644 index 0000000000000..985e69b772e52 --- /dev/null +++ b/.libcst.codemod.yaml @@ -0,0 +1,18 @@ +# String that LibCST should look for in code which indicates that the +# module is generated code. +generated_code_marker: '@generated' +# Command line and arguments for invoking a code formatter. Anything +# specified here must be capable of taking code via stdin and returning +# formatted code via stdout. +formatter: ['black', '-'] +# List of regex patterns which LibCST will evaluate against filenames to +# determine if the module should be touched. +blacklist_patterns: [] +# List of modules that contain codemods inside of them. +modules: +- 'libcst.codemod.commands' +- 'autotyping' +# Absolute or relative path of the repository root, used for providing +# full-repo metadata. Relative paths should be specified with this file +# location as the base. +repo_root: '.' 
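For context, `.libcst.codemod.yaml` is the file LibCST reads when a codemod is run from the repository root, and the `autotyping` entry under `modules` makes that package's codemods discoverable. Below is a minimal sketch of a manual invocation under those assumptions (the codemod class name and the target file are illustrative, not part of this patch); in the pre-commit configuration that follows, the same machinery is driven through `python -m scripts.run_autotyping`.

```sh
# Illustrative sketch only; the codemod name and target path are assumptions.
# LibCST discovers .libcst.codemod.yaml in the repo root, honors its
# blacklist_patterns, and pipes rewritten code through the configured
# formatter ("black -").
python -m libcst.tool codemod autotyping.AutotypeCommand pandas/io/common.py
```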
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7fd797fb7230..f369fcabe3f01 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,44 +1,442 @@ +minimum_pre_commit_version: 2.15.0 +exclude: ^LICENSES/|\.(html|csv|svg)$ +# reserve "manual" for relatively slow hooks which we still want to run in CI +default_stages: [ + commit, + merge-commit, + push, + prepare-commit-msg, + commit-msg, + post-checkout, + post-commit, + post-merge, + post-rewrite +] +ci: + autofix_prs: false repos: -- repo: https://github.com/python/black - rev: 19.10b0 +- repo: local hooks: + # NOTE: we make `black` a local hook because if it's installed from + # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc - id: black - language_version: python3 -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.7 - hooks: - - id: flake8 - language: python_venv - additional_dependencies: [flake8-comprehensions>=3.1.0] - - id: flake8 - name: flake8-pyx - language: python_venv - files: \.(pyx|pxd)$ - types: - - file - args: [--append-config=flake8/cython.cfg] - - id: flake8 - name: flake8-pxd - language: python_venv - files: \.pxi\.in$ - types: - - file - args: [--append-config=flake8/cython-template.cfg] -- repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.21 + name: black + description: "Black: The uncompromising Python code formatter" + entry: black + language: python + require_serial: true + types_or: [python, pyi] + additional_dependencies: [black==23.1.0] +- repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.0.244 + hooks: + - id: ruff +- repo: https://github.com/jendrikseipp/vulture + rev: 'v2.7' + hooks: + - id: vulture + entry: python scripts/run_vulture.py + pass_filenames: true + require_serial: false +- repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + types_or: [python, rst, markdown] + additional_dependencies: [tomli] +- repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.12.4 + hooks: + - id: cython-lint + - id: double-quote-cython-strings +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: debug-statements + - id: end-of-file-fixer + exclude: \.txt$ + stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, + post-checkout, post-commit, post-merge, post-rewrite] + - id: trailing-whitespace + stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, + post-checkout, post-commit, post-merge, post-rewrite] +- repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + # We don't lint all C files because we don't want to lint any that are built + # from Cython files nor do we want to lint C files that we didn't modify for + # this particular codebase (e.g. src/headers, src/klib). However, + # we can lint all header files since they aren't "generated" like C files are. 
+ exclude: ^pandas/_libs/src/(klib|headers)/ + args: [ + --quiet, + '--extensions=c,h', + '--headers=h', + --recursive, + --linelength=88, + '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' + ] +- repo: https://github.com/pycqa/pylint + rev: v2.16.1 + hooks: + - id: pylint + stages: [manual] +- repo: https://github.com/pycqa/pylint + rev: v2.16.1 + hooks: + - id: pylint + alias: redefined-outer-name + name: Redefining name from outer scope + files: ^pandas/ + exclude: | + (?x) + ^pandas/tests # keep excluded + |/_testing/ # keep excluded + |^pandas/util/_test_decorators\.py # keep excluded + |^pandas/_version\.py # keep excluded + |^pandas/conftest\.py # keep excluded + args: [--disable=all, --enable=redefined-outer-name] + stages: [manual] +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 hooks: - id: isort - language: python_venv - exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.730 - hooks: - - id: mypy - args: - # As long as a some files are excluded from check-untyped-defs - # we have to exclude it from the pre-commit hook as the configuration - # is based on modules but the hook runs on files. - - --no-check-untyped-defs - - --follow-imports - - skip - files: pandas/ +- repo: https://github.com/asottile/pyupgrade + rev: v3.3.1 + hooks: + - id: pyupgrade + args: [--py38-plus] +- repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: rst-backticks + - id: rst-directive-colons + types: [text] # overwrite types: [rst] + types_or: [python, rst] + - id: rst-inline-touching-normal + types: [text] # overwrite types: [rst] + types_or: [python, rst] +- repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.6.7 + hooks: + - id: sphinx-lint +- repo: local + hooks: + - id: pyright + # note: assumes python env is setup and activated + name: pyright + entry: pyright + language: node + pass_filenames: false + types: [python] + stages: [manual] + additional_dependencies: &pyright_dependencies + - pyright@1.1.292 + - id: pyright_reportGeneralTypeIssues + # note: assumes python env is setup and activated + name: pyright reportGeneralTypeIssues + entry: pyright --skipunannotated -p pyright_reportGeneralTypeIssues.json --level warning + language: node + pass_filenames: false + types: [python] + stages: [manual] + additional_dependencies: *pyright_dependencies + - id: mypy + # note: assumes python env is setup and activated + name: mypy + entry: mypy + language: system + pass_filenames: false + types: [python] + stages: [manual] + - id: stubtest + # note: assumes python env is setup and activated + # note: requires pandas dev to be installed + name: mypy (stubtest) + entry: python + language: system + pass_filenames: false + types: [pyi] + args: [scripts/run_stubtest.py] + stages: [manual] + - id: flake8-rst + name: flake8-rst + description: Run flake8 on code snippets in docstrings or RST files + language: python + entry: flake8-rst + types: [rst] + args: [--filename=*.rst] + additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] + - id: inconsistent-namespace-usage + name: 'Check for inconsistent use of pandas namespace' + entry: python scripts/check_for_inconsistent_pandas_namespace.py + exclude: ^pandas/core/interchange/ + language: python + types: [python] + - id: no-os-remove + name: Check code for instances of os.remove + entry: os\.remove + language: pygrep + types: [python] + files: ^pandas/tests/ + exclude: | + (?x)^ + 
pandas/tests/io/pytables/test_store\.py$ + - id: unwanted-patterns + name: Unwanted patterns + language: pygrep + entry: | + (?x) + # outdated annotation syntax, missing error codes + \#\ type:\ (?!ignore) + |\#\ type:\s?ignore(?!\[) + + # foo._class__ instead of type(foo) + |\.__class__ + + # np.bool/np.object instead of np.bool_/np.object_ + |np\.bool[^_8`] + |np\.object[^_8`] + + # imports from collections.abc instead of `from collections import abc` + |from\ collections\.abc\ import + + # Numpy + |from\ numpy\ import\ random + |from\ numpy\.random\ import + + # Incorrect code-block / IPython directives + |\.\.\ code-block\ :: + |\.\.\ ipython\ :: + # directive should not have a space before :: + |\.\.\ \w+\ :: + + # Check for deprecated messages without sphinx directive + |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) + + # {foo!r} instead of {repr(foo)} + |!r} + + # builtin filter function + |(?obj`, not ` obj` + language: pygrep + entry: '[a-zA-Z0-9*]> ' + files: (\.pyx|\.pxi.in)$ + - id: incorrect-backticks + name: Check for backticks incorrectly rendering because of missing spaces + language: pygrep + entry: '[a-zA-Z0-9]\`\`?[a-zA-Z0-9]' + types: [rst] + files: ^doc/source/ + - id: seed-check-asv + name: Check for unnecessary random seeds in asv benchmarks + language: pygrep + entry: 'np\.random\.seed' + files: ^asv_bench/benchmarks + exclude: ^asv_bench/benchmarks/pandas_vb_common\.py + - id: np-testing-array-equal + name: Check for usage of numpy testing or array_equal + language: pygrep + entry: '(numpy|np)(\.testing|\.array_equal)' + files: ^pandas/tests/ + types: [python] + - id: invalid-ea-testing + name: Check for invalid EA testing + language: pygrep + entry: 'tm\.assert_(series|frame)_equal' + files: ^pandas/tests/extension/base + types: [python] + exclude: ^pandas/tests/extension/base/base\.py + - id: unwanted-patterns-in-tests + name: Unwanted patterns in tests + language: pygrep + entry: | + (?x) + # pytest.xfail instead of pytest.mark.xfail + pytest\.xfail + + # imports from pandas._testing instead of `import pandas._testing as tm` + |from\ pandas\._testing\ import + |from\ pandas\ import\ _testing\ as\ tm + + # No direct imports from conftest + |conftest\ import + |import\ conftest + + # pandas.testing instead of tm + |pd\.testing\. + + # pd.api.types instead of from pandas.api.types import ... + |(pd|pandas)\.api\.types\. 
+ + # np.testing, np.array_equal + |(numpy|np)(\.testing|\.array_equal) + + # unittest.mock (use pytest builtin monkeypatch fixture instead) + |(unittest(\.| import )mock|mock\.Mock\(\)|mock\.patch) + + # pytest raises without context + |\s\ pytest.raises + + # pytest.warns (use tm.assert_produces_warning instead) + |pytest\.warns + files: ^pandas/tests/ + types_or: [python, cython, rst] + - id: unwanted-patterns-in-ea-tests + name: Unwanted patterns in EA tests + language: pygrep + entry: | + (?x) + tm.assert_(series|frame)_equal + files: ^pandas/tests/extension/base/ + exclude: ^pandas/tests/extension/base/base\.py$ + types_or: [python, cython, rst] + - id: unwanted-patterns-in-cython + name: Unwanted patterns in Cython code + language: pygrep + entry: | + (?x) + # `obj` as opposed to ` obj` + [a-zA-Z0-9*]>[ ] + types: [cython] + - id: pip-to-conda + name: Generate pip dependency from conda + language: python + entry: python scripts/generate_pip_deps_from_conda.py + files: ^(environment.yml|requirements-dev.txt)$ + pass_filenames: false + additional_dependencies: [pyyaml, toml] + - id: title-capitalization + name: Validate correct capitalization among titles in documentation + entry: python scripts/validate_rst_title_capitalization.py + language: python + types: [rst] + files: ^doc/source/(development|reference)/ + - id: unwanted-patterns-bare-pytest-raises + name: Check for use of bare pytest raises + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" + types: [python] + files: ^pandas/tests/ + exclude: ^pandas/tests/extension/ + - id: unwanted-patterns-private-function-across-module + name: Check for use of private functions across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" + types: [python] + exclude: ^(asv_bench|pandas/tests|doc)/ + - id: unwanted-patterns-private-import-across-module + name: Check for import of private attributes across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" + types: [python] + exclude: | + (?x) + ^(asv_bench|pandas/tests|doc)/ + |scripts/validate_min_versions_in_sync\.py$ + - id: unwanted-patterns-strings-to-concatenate + name: Check for use of not concatenated strings + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" + types_or: [python, cython] + - id: unwanted-patterns-strings-with-misplaced-whitespace + name: Check for strings with misplaced spaces + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" + types_or: [python, cython] + - id: use-pd_array-in-core + name: Import pandas.array as pd_array in core + language: python + entry: python scripts/use_pd_array_in_core.py + files: ^pandas/core/ + exclude: ^pandas/core/api\.py$ + types: [python] + - id: use-io-common-urlopen + name: Use pandas.io.common.urlopen instead of urllib.request.urlopen + language: python + entry: python scripts/use_io_common_urlopen.py + files: ^pandas/ + exclude: ^pandas/tests/ + types: [python] + - id: no-bool-in-core-generic + name: Use bool_t instead of bool in pandas/core/generic.py + entry: python scripts/no_bool_in_generic.py + language: python + files: ^pandas/core/generic\.py$ + - id: no-return-exception + name: Use raise instead of return for exceptions + language: pygrep + entry: 'return 
[A-Za-z]+(Error|Exit|Interrupt|Exception|Iteration)' + files: ^pandas/ + types: [python] + exclude: ^pandas/tests/ + - id: pandas-errors-documented + name: Ensure pandas errors are documented in doc/source/reference/testing.rst + entry: python scripts/pandas_errors_documented.py + language: python + files: ^pandas/errors/__init__.py$ + - id: pg8000-not-installed-CI + name: Check for pg8000 not installed on CI for test_pg8000_sqlalchemy_passthrough_error + language: pygrep + entry: 'pg8000' + files: ^ci/deps + types: [yaml] + - id: validate-min-versions-in-sync + name: Check minimum version of dependencies are aligned + entry: python scripts/validate_min_versions_in_sync.py + language: python + files: ^(ci/deps/actions-.*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ + additional_dependencies: [tomli] + - id: validate-errors-locations + name: Validate errors locations + description: Validate errors are in appropriate locations. + entry: python scripts/validate_exception_location.py + language: python + files: ^pandas/ + exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) + types: [python] + - id: flake8-pyi + name: flake8-pyi + entry: flake8 --extend-ignore=E301,E302,E305,E701,E704 + types: [pyi] + language: python + additional_dependencies: + - flake8==5.0.4 + - flake8-pyi==22.8.1 + - id: future-annotations + name: import annotations from __future__ + entry: 'from __future__ import annotations' + language: pygrep + args: [--negate] + files: ^pandas/ + types: [python] + exclude: | + (?x) + /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ + |/tests/ + |/_testing/ + - id: autotyping + name: autotyping + entry: python -m scripts.run_autotyping + types_or: [python, pyi] + files: ^pandas + exclude: ^(pandas/tests|pandas/_version.py|pandas/io/clipboard) + language: python + stages: [manual] + additional_dependencies: + - autotyping==22.9.0 + - libcst==0.4.7 + - id: check-test-naming + name: check that test names start with 'test' + entry: python -m scripts.check_test_naming + types: [python] + files: ^pandas/tests + language: python diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b016cf386098e..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,105 +0,0 @@ -language: python -python: 3.7 - -# To turn off cached cython files and compiler cache -# set NOCACHE-true -# To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run -# travis cache --delete inside the project directory from the travis command line client -# The cache directories will be deleted if anything in ci/ changes in a commit -cache: - ccache: true - directories: - - $HOME/.cache # cython cache - - $HOME/.ccache # compiler cache - -env: - global: - # Variable for test workers - - PYTEST_WORKERS="auto" - # create a github personal access token - # cd pandas-dev/pandas - # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas - - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" - -git: - # for cloning - depth: false - -matrix: - fast_finish: true - - include: - # In allowed failures - - dist: bionic - python: 3.9-dev - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" - - env: - - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)" - - - env: - - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow 
and not network and not clipboard)" - - - arch: arm64 - env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - - - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" - services: - - mysql - - postgresql - - - env: - # Enabling Deprecations when running tests - # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs - # See pandas/_testing.py for more details. - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" - services: - - mysql - - postgresql - allow_failures: - - arch: arm64 - env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - - dist: bionic - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" - - -before_install: - - echo "before_install" - # Use blocking IO on travis. Ref: https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024 - - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);' - - source ci/travis_process_gbq_encryption.sh - - export PATH="$HOME/miniconda3/bin:$PATH" - - df -h - - pwd - - uname -a - - git --version - - ./ci/check_git_tags.sh - # Because travis runs on Google Cloud and has a /etc/boto.cfg, - # it breaks moto import, see: - # https://github.com/spulec/moto/issues/1771 - # https://github.com/boto/boto/issues/3741 - # This overrides travis and tells it to look nowhere. - - export BOTO_CONFIG=/dev/null - -install: - - echo "install start" - - ci/prep_cython_cache.sh - - ci/setup_env.sh - - ci/submit_cython_cache.sh - - echo "install done" - -script: - - echo "script start" - - echo "$JOB" - - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi - - ci/run_tests.sh - -after_script: - - echo "after_script start" - - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - ci/print_skipped.py - - echo "after_script done" diff --git a/AUTHORS.md b/AUTHORS.md index f576e333f9448..84fcfe05e3043 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -54,4 +54,3 @@ pandas is distributed under a 3-clause ("Simplified" or "New") BSD license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have BSD-compatible licenses, are included. Their licenses follow the pandas license. - diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000..0161dfa92fdef --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,10 @@ +cff-version: 1.2.0 +title: 'pandas-dev/pandas: Pandas' +message: 'If you use this software, please cite it as below.' +authors: + - name: "The pandas development team" +license: BSD-3-Clause +license-url: "https://github.com/pandas-dev/pandas/blob/main/LICENSE" +repository-code: "https://github.com/pandas-dev/pandas" +type: software +url: "https://github.com/pandas-dev/pandas" diff --git a/Dockerfile b/Dockerfile index b8aff5d671dcf..7230dcab20f6e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,47 +1,13 @@ -FROM continuumio/miniconda3 +FROM python:3.10.8 +WORKDIR /home/pandas -# if you forked pandas, you can pass in your own GitHub username to use your fork -# i.e. 
gh_username=myname -ARG gh_username=pandas-dev -ARG pandas_home="/home/pandas" +RUN apt-get update && apt-get -y upgrade +RUN apt-get install -y build-essential -# Avoid warnings by switching to noninteractive -ENV DEBIAN_FRONTEND=noninteractive +# hdf5 needed for pytables installation +RUN apt-get install -y libhdf5-dev -# Configure apt and install packages -RUN apt-get update \ - && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ - # - # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed - && apt-get -y install git iproute2 procps iproute2 lsb-release \ - # - # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill), - # needed to build pandas C extensions - && apt-get -y install build-essential \ - # - # cleanup - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* - -# Switch back to dialog for any ad-hoc use of apt-get -ENV DEBIAN_FRONTEND=dialog - -# Clone pandas repo -RUN mkdir "$pandas_home" \ - && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \ - && cd "$pandas_home" \ - && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ - && git pull upstream master - -# Because it is surprisingly difficult to activate a conda environment inside a DockerFile -# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), -# we just update the base/root one from the 'environment.yml' file instead of creating a new one. -# -# Set up environment -RUN conda env update -n base -f "$pandas_home/environment.yml" - -# Build C extensions and pandas -RUN cd "$pandas_home" \ - && python setup.py build_ext --inplace -j 4 \ - && python -m pip install -e . +RUN python -m pip install --upgrade pip +RUN python -m pip install \ + -r https://raw.githubusercontent.com/pandas-dev/pandas/main/requirements-dev.txt +CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE index 76954a5a339ab..cdfa749dc34df 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2020, Open source contributors. +Copyright (c) 2011-2023, Open source contributors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/LICENSES/KLIB_LICENSE b/LICENSES/KLIB_LICENSE new file mode 100644 index 0000000000000..0a996fae3360f --- /dev/null +++ b/LICENSES/KLIB_LICENSE @@ -0,0 +1,23 @@ +The MIT License + +Copyright (c) 2008- Attractive Chaos + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSES/OTHER b/LICENSES/OTHER index f0550b4ee208a..7446d68eb43a6 100644 --- a/LICENSES/OTHER +++ b/LICENSES/OTHER @@ -1,8 +1,3 @@ -numpydoc license ----------------- - -The numpydoc license is in pandas/doc/sphinxext/LICENSE.txt - Bottleneck license ------------------ @@ -77,4 +72,4 @@ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/PACKAGING_LICENSE b/LICENSES/PACKAGING_LICENSE new file mode 100644 index 0000000000000..4216ea1ce2379 --- /dev/null +++ b/LICENSES/PACKAGING_LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + +Copyright (c) Donald Stufft and individual contributors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/PYUPGRADE_LICENSE b/LICENSES/PYUPGRADE_LICENSE new file mode 100644 index 0000000000000..522fbe20b8991 --- /dev/null +++ b/LICENSES/PYUPGRADE_LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Anthony Sottile + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index 3b2886eb9cfae..a905fb017d813 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -28,7 +28,7 @@ Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from from TCL library +Numeric decoder derived from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/MANIFEST.in b/MANIFEST.in index cf6a1835433a4..d2b1b8cb887bc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,5 @@ -include MANIFEST.in -include LICENSE include RELEASE.md -include README.md -include setup.py -include pyproject.toml +include versioneer.py graft doc prune doc/build @@ -16,30 +12,49 @@ global-exclude *.bz2 global-exclude *.csv global-exclude *.dta global-exclude *.feather +global-exclude *.tar global-exclude *.gz global-exclude *.h5 global-exclude *.html global-exclude *.json +global-exclude *.jsonl +global-exclude *.msgpack +global-exclude *.pdf global-exclude *.pickle global-exclude *.png -global-exclude *.pyc -global-exclude *.pyd +global-exclude *.pptx global-exclude *.ods global-exclude *.odt +global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so global-exclude *.xls +global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt +global-exclude *.cpt global-exclude *.xz global-exclude *.zip +global-exclude *.zst global-exclude *~ global-exclude .DS_Store global-exclude .git* global-exclude \#* -include versioneer.py -include pandas/_version.py -include pandas/io/formats/templates/*.tpl +global-exclude *.c +global-exclude *.cpp +global-exclude *.h + +global-exclude *.py[ocd] +global-exclude *.pxi + +# GH 39321 +# csv_dir_path fixture checks the existence of the directory +# exclude the whole directory to avoid running related tests in sdist +prune pandas/tests/io/parser/data + +# Selectively re-add *.cxx files that were excluded above +graft pandas/_libs/src +graft pandas/_libs/tslibs/src diff --git a/Makefile b/Makefile deleted file mode 100644 index f26689ab65ba5..0000000000000 --- a/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -.PHONY : develop build clean clean_pyc doc lint-diff black - -all: develop - -clean: - -python setup.py clean - -clean_pyc: - -find . -name '*.py[co]' -exec rm {} \; - -build: clean_pyc - python setup.py build_ext --inplace - -lint-diff: - git diff upstream/master --name-only -- "*.py" | xargs flake8 - -black: - black . - -develop: build - python -m pip install --no-build-isolation -e . - -doc: - -rm -rf doc/build doc/source/generated - cd doc; \ - python make.py clean; \ - python make.py html diff --git a/README.md b/README.md index a72e8402e68a0..38f4afb3e2f22 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
-  [pandas logo image]
+  [pandas logo image]
----------------- @@ -9,14 +9,13 @@ [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) -[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE) -[![Travis Build Status](https://travis-ci.org/pandas-dev/pandas.svg?branch=master)](https://travis-ci.org/pandas-dev/pandas) -[![Azure Build Status](https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master)](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master) -[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master)](https://codecov.io/gh/pandas-dev/pandas) -[![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org) -[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) +[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) +[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas) +[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas) +[![Slack](https://img.shields.io/badge/join_Slack-information-brightgreen.svg?logo=slack)](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) ## What is it? @@ -32,7 +31,7 @@ its way towards this goal. Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`) in floating point as well as non-floating point data + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional objects @@ -60,34 +59,34 @@ Here are just a few of the things that pandas does well: and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, - date shifting and lagging. 
- - - [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data - [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion - [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures - [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine - [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe - [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges - [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix - [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing - [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging - [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index - [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables - [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations - [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex - [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files - [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files - [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries - [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables - [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality + date shifting and lagging + + + [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html + [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion + [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures + [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine + [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe + [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges + [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced + [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing + [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging + [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index + [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html + [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html + [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex + [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files + [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files + [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries + [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables + [timeseries]: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality ## Where to get it The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -package index](https://pypi.org/project/pandas) and on conda. +Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). ```sh # conda @@ -100,15 +99,15 @@ pip install pandas ``` ## Dependencies -- [NumPy](https://www.numpy.org) -- [python-dateutil](https://labix.org/python-dateutil) -- [pytz](https://pythonhosted.org/pytz) +- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org) +- [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html) +- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://github.com/stub42/pytz) See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies. ## Installation from sources -To install pandas from source you need Cython in addition to the normal -dependencies above. Cython can be installed from pypi: +To install pandas from source you need [Cython](https://cython.org/) in addition to the normal +dependencies above. Cython can be installed from PyPI: ```sh pip install cython @@ -121,22 +120,20 @@ cloning the git repo), execute: python setup.py install ``` -or for installing in [development mode](https://pip.pypa.io/en/latest/reference/pip_install.html#editable-installs): +or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): ```sh python -m pip install -e . --no-build-isolation --no-use-pep517 ``` -If you have `make`, you can also use `make develop` to run the same command. - or alternatively ```sh python setup.py develop ``` -See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/install.html#installing-from-source). +See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source). ## License [BSD 3](LICENSE) @@ -145,7 +142,7 @@ See the full instructions for [installing from source](https://pandas.pydata.org The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable ## Background -Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and +Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and has been under active development since then. ## Getting Help @@ -154,13 +151,13 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). ## Discussion and Development -Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. 
+Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions. ## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. -A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. +A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. @@ -168,6 +165,6 @@ You can also triage issues which may include reproducing bug reports, or asking Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it! -Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). +Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack). -As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md) +As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md) diff --git a/RELEASE.md b/RELEASE.md index 42cb82dfcf020..344a097a3e81e 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ Release Notes ============= -The list of changes to Pandas between each release can be found +The list of changes to pandas between each release can be found [here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full details, see the commit logs at https://github.com/pandas-dev/pandas. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 4583fac85b776..c503ae5e17471 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -13,6 +13,10 @@ // benchmarked "repo": "..", + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], + // The tool to use to create environments. 
May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically @@ -25,8 +29,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - // "pythons": ["2.7", "3.4"], - "pythons": ["3.6"], + "pythons": ["3.8"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -39,26 +42,22 @@ // followed by the pip installed packages). "matrix": { "numpy": [], - "Cython": ["0.29.16"], + "Cython": ["0.29.33"], "matplotlib": [], "sqlalchemy": [], "scipy": [], "numba": [], "numexpr": [], "pytables": [null, ""], // platform dependent, see excludes below + "pyarrow": [], "tables": [null, ""], "openpyxl": [], "xlsxwriter": [], "xlrd": [], - "xlwt": [], "odfpy": [], - "pytest": [], "jinja2": [], - // If using Windows with python 2.7 and want to build using the - // mingw toolchain (rather than MSVC), uncomment the following line. - // "libpython": [], }, - "conda_channels": ["defaults", "conda-forge"], + "conda_channels": ["conda-forge"], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. @@ -126,6 +125,7 @@ "regression_thresholds": { }, "build_command": - ["python setup.py build -j4", + ["python -m pip install versioneer[toml]", + "python setup.py build -j4", "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..eef81242abc7c 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -2,8 +2,6 @@ import numpy as np -from pandas._libs import lib - import pandas as pd from .pandas_vb_common import tm @@ -16,21 +14,7 @@ pass -class MaybeConvertObjects: - def setup(self): - N = 10 ** 5 - - data = list(range(N)) - data[0] = pd.NaT - data = np.array(data) - self.data = data - - def time_maybe_convert_objects(self): - lib.maybe_convert_objects(self.data) - - class Factorize: - params = [ [True, False], [True, False], @@ -38,28 +22,38 @@ class Factorize: "int", "uint", "float", - "string", + "object", "datetime64[ns]", "datetime64[ns, tz]", "Int64", "boolean", + "string[pyarrow]", ], ] param_names = ["unique", "sort", "dtype"] def setup(self, unique, sort, dtype): - N = 10 ** 5 + N = 10**5 + string_index = tm.makeStringIndex(N) + string_arrow = None + if dtype == "string[pyarrow]": + try: + string_arrow = pd.array(string_index, dtype="string[pyarrow]") + except ImportError: + raise NotImplementedError + data = { - "int": pd.Int64Index(np.arange(N)), - "uint": pd.UInt64Index(np.arange(N)), - "float": pd.Float64Index(np.random.randn(N)), - "string": tm.makeStringIndex(N), + "int": pd.Index(np.arange(N), dtype="int64"), + "uint": pd.Index(np.arange(N), dtype="uint64"), + "float": pd.Index(np.random.randn(N), dtype="float64"), + "object": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), + "string[pyarrow]": string_arrow, }[dtype] if not unique: data = data.repeat(5) @@ -70,7 +64,6 @@ def time_factorize(self, unique, sort, dtype): class 
Duplicated: - params = [ [True, False], ["first", "last", False], @@ -79,11 +72,11 @@ class Duplicated: param_names = ["unique", "keep", "dtype"] def setup(self, unique, keep, dtype): - N = 10 ** 5 + N = 10**5 data = { - "int": pd.Int64Index(np.arange(N)), - "uint": pd.UInt64Index(np.arange(N)), - "float": pd.Float64Index(np.random.randn(N)), + "int": pd.Index(np.arange(N), dtype="int64"), + "uint": pd.Index(np.arange(N), dtype="uint64"), + "float": pd.Index(np.random.randn(N), dtype="float64"), "string": tm.makeStringIndex(N), "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( @@ -100,9 +93,31 @@ def time_duplicated(self, unique, keep, dtype): self.idx.duplicated(keep=keep) +class DuplicatedMaskedArray: + params = [ + [True, False], + ["first", "last", False], + ["Int64", "Float64"], + ] + param_names = ["unique", "keep", "dtype"] + + def setup(self, unique, keep, dtype): + N = 10**5 + data = pd.Series(np.arange(N), dtype=dtype) + data[list(range(1, N, 100))] = pd.NA + if not unique: + data = data.repeat(5) + self.ser = data + # cache is_unique + self.ser.is_unique + + def time_duplicated(self, unique, keep, dtype): + self.ser.duplicated(keep=keep) + + class Hashing: def setup_cache(self): - N = 10 ** 5 + N = 10**5 df = pd.DataFrame( { @@ -150,7 +165,7 @@ class Quantile: param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): - N = 10 ** 5 + N = 10**5 data = { "int": np.arange(N), "uint": np.arange(N).astype(np.uint64), @@ -163,7 +178,7 @@ def time_quantile(self, quantile, interpolation, dtype): class SortIntegerArray: - params = [10 ** 3, 10 ** 5] + params = [10**3, 10**5] def setup(self, N): data = np.arange(N, dtype=float) diff --git a/asv_bench/benchmarks/algos/__init__.py b/asv_bench/benchmarks/algos/__init__.py new file mode 100644 index 0000000000000..97c9ab09b9c6b --- /dev/null +++ b/asv_bench/benchmarks/algos/__init__.py @@ -0,0 +1,12 @@ +""" +algos/ directory is intended for individual functions from core.algorithms + +In many cases these algorithms are reachable in multiple ways: + algos.foo(x, y) + Series(x).foo(y) + Index(x).foo(y) + pd.array(x).foo(y) + +In most cases we profile the Series variant directly, trusting the performance +of the others to be highly correlated. 
+""" diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py new file mode 100644 index 0000000000000..ac79ab65cea81 --- /dev/null +++ b/asv_bench/benchmarks/algos/isin.py @@ -0,0 +1,340 @@ +import numpy as np + +from pandas import ( + Categorical, + Index, + NaT, + Series, + date_range, +) + +from ..pandas_vb_common import tm + + +class IsIn: + params = [ + "int64", + "uint64", + "object", + "Int64", + "boolean", + "bool", + "datetime64[ns]", + "category[object]", + "category[int]", + "str", + "string[python]", + "string[pyarrow]", + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10000 + + self.mismatched = [NaT.to_datetime64()] * 2 + + if dtype in ["boolean", "bool"]: + self.series = Series(np.random.randint(0, 2, N)).astype(dtype) + self.values = [True, False] + + elif dtype == "datetime64[ns]": + # Note: values here is much larger than non-dt64ns cases + + # dti has length=115777 + dti = date_range(start="2015-10-26", end="2016-01-01", freq="50s") + self.series = Series(dti) + self.values = self.series._values[::3] + self.mismatched = [1, 2] + + elif dtype in ["category[object]", "category[int]"]: + # Note: sizes are different in this case than others + n = 5 * 10**5 + sample_size = 100 + + arr = list(np.random.randint(0, n // 10, size=n)) + if dtype == "category[object]": + arr = [f"s{i:04d}" for i in arr] + + self.values = np.random.choice(arr, sample_size) + self.series = Series(arr).astype("category") + + elif dtype in ["str", "string[python]", "string[pyarrow]"]: + try: + self.series = Series(tm.makeStringIndex(N), dtype=dtype) + except ImportError: + raise NotImplementedError + self.values = list(self.series[:2]) + + else: + self.series = Series(np.random.randint(1, 10, N)).astype(dtype) + self.values = [1, 2] + + self.cat_values = Categorical(self.values) + + def time_isin(self, dtype): + self.series.isin(self.values) + + def time_isin_categorical(self, dtype): + self.series.isin(self.cat_values) + + def time_isin_empty(self, dtype): + self.series.isin([]) + + def time_isin_mismatched_dtype(self, dtype): + self.series.isin(self.mismatched) + + +class IsinAlmostFullWithRandomInt: + params = [ + [np.float64, np.int64, np.uint64, np.object_], + range(10, 21), + ["inside", "outside"], + ] + param_names = ["dtype", "exponent", "title"] + + def setup(self, dtype, exponent, title): + M = 3 * 2 ** (exponent - 2) + # 0.77-the maximal share of occupied buckets + self.series = Series(np.random.randint(0, M, M)).astype(dtype) + + values = np.random.randint(0, M, M).astype(dtype) + if title == "inside": + self.values = values + elif title == "outside": + self.values = values + M + else: + raise ValueError(title) + + def time_isin(self, dtype, exponent, title): + self.series.isin(self.values) + + +class IsinWithRandomFloat: + params = [ + [np.float64, np.object_], + [ + 1_300, + 2_000, + 7_000, + 8_000, + 70_000, + 80_000, + 750_000, + 900_000, + ], + ["inside", "outside"], + ] + param_names = ["dtype", "size", "title"] + + def setup(self, dtype, size, title): + self.values = np.random.rand(size) + self.series = Series(self.values).astype(dtype) + np.random.shuffle(self.values) + + if title == "outside": + self.values = self.values + 0.1 + + def time_isin(self, dtype, size, title): + self.series.isin(self.values) + + +class IsinWithArangeSorted: + params = [ + [np.float64, np.int64, np.uint64, np.object_], + [ + 1_000, + 2_000, + 8_000, + 100_000, + 1_000_000, + ], + ] + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + self.series 
= Series(np.arange(size)).astype(dtype) + self.values = np.arange(size).astype(dtype) + + def time_isin(self, dtype, size): + self.series.isin(self.values) + + +class IsinWithArange: + params = [ + [np.float64, np.int64, np.uint64, np.object_], + [ + 1_000, + 2_000, + 8_000, + ], + [-2, 0, 2], + ] + param_names = ["dtype", "M", "offset_factor"] + + def setup(self, dtype, M, offset_factor): + offset = int(M * offset_factor) + tmp = Series(np.random.randint(offset, M + offset, 10**6)) + self.series = tmp.astype(dtype) + self.values = np.arange(M).astype(dtype) + + def time_isin(self, dtype, M, offset_factor): + self.series.isin(self.values) + + +class IsInFloat64: + params = [ + [np.float64, "Float64"], + ["many_different_values", "few_different_values", "only_nans_values"], + ] + param_names = ["dtype", "title"] + + def setup(self, dtype, title): + N_many = 10**5 + N_few = 10**6 + self.series = Series([1, 2], dtype=dtype) + + if title == "many_different_values": + # runtime is dominated by creation of the lookup-table + self.values = np.arange(N_many, dtype=np.float64) + elif title == "few_different_values": + # runtime is dominated by creation of the lookup-table + self.values = np.zeros(N_few, dtype=np.float64) + elif title == "only_nans_values": + # runtime is dominated by creation of the lookup-table + self.values = np.full(N_few, np.nan, dtype=np.float64) + else: + raise ValueError(title) + + def time_isin(self, dtype, title): + self.series.isin(self.values) + + +class IsInForObjects: + """ + A subset of the cartesian product of cases have special motivations: + + "nans" x "nans" + if nan-objects are different objects, + this has the potential to trigger O(n^2) running time + + "short" x "long" + running time dominated by the preprocessing + + "long" x "short" + running time dominated by look-up + + "long" x "long" + no dominating part + + "long_floats" x "long_floats" + because of nans floats are special + no dominating part + + """ + + variants = ["nans", "short", "long", "long_floats"] + + params = [variants, variants] + param_names = ["series_type", "vals_type"] + + def setup(self, series_type, vals_type): + N_many = 10**5 + + if series_type == "nans": + ser_vals = np.full(10**4, np.nan) + elif series_type == "short": + ser_vals = np.arange(2) + elif series_type == "long": + ser_vals = np.arange(N_many) + elif series_type == "long_floats": + ser_vals = np.arange(N_many, dtype=np.float_) + + self.series = Series(ser_vals).astype(object) + + if vals_type == "nans": + values = np.full(10**4, np.nan) + elif vals_type == "short": + values = np.arange(2) + elif vals_type == "long": + values = np.arange(N_many) + elif vals_type == "long_floats": + values = np.arange(N_many, dtype=np.float_) + + self.values = values.astype(object) + + def time_isin(self, series_type, vals_type): + self.series.isin(self.values) + + +class IsInLongSeriesLookUpDominates: + params = [ + ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], + [5, 1000], + ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], + ] + param_names = ["dtype", "MaxNumber", "series_type"] + + def setup(self, dtype, MaxNumber, series_type): + N = 10**7 + + if series_type == "random_hits": + array = np.random.randint(0, MaxNumber, N) + if series_type == "random_misses": + array = np.random.randint(0, MaxNumber, N) + MaxNumber + if series_type == "monotone_hits": + array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + if series_type == "monotone_misses": + array = np.arange(N) + MaxNumber + + 
self.series = Series(array).astype(dtype) + + self.values = np.arange(MaxNumber).astype(dtype.lower()) + + def time_isin(self, dtypes, MaxNumber, series_type): + self.series.isin(self.values) + + +class IsInLongSeriesValuesDominate: + params = [ + ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], + ["random", "monotone"], + ] + param_names = ["dtype", "series_type"] + + def setup(self, dtype, series_type): + N = 10**7 + + if series_type == "random": + vals = np.random.randint(0, 10 * N, N) + if series_type == "monotone": + vals = np.arange(N) + + self.values = vals.astype(dtype.lower()) + M = 10**6 + 1 + self.series = Series(np.arange(M)).astype(dtype) + + def time_isin(self, dtypes, series_type): + self.series.isin(self.values) + + +class IsInWithLongTupples: + def setup(self): + t = tuple(range(1000)) + self.series = Series([t] * 1000) + self.values = [t] + + def time_isin(self): + self.series.isin(self.values) + + +class IsInIndexes: + def setup(self): + self.range_idx = Index(range(1000)) + self.index = Index(list(range(1000))) + self.series = Series(np.random.randint(100_000, size=1000)) + + def time_isin_range_index(self): + self.series.isin(self.range_idx) + + def time_isin_index(self): + self.series.isin(self.index) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 3ef6ab6209ea7..ab3b38fee1b06 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -4,7 +4,13 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Timestamp, date_range, to_timedelta +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, + to_timedelta, +) import pandas._testing as tm from pandas.core.algorithms import checked_add_with_arr @@ -53,7 +59,7 @@ def time_frame_op_with_scalar(self, dtype, scalar, op): class OpWithFillValue: def setup(self): # GH#31300 - arr = np.arange(10 ** 6) + arr = np.arange(10**6) df = DataFrame({"A": arr}) ser = df["A"] @@ -87,7 +93,7 @@ class MixedFrameWithSeriesAxis: param_names = ["opname"] def setup(self, opname): - arr = np.arange(10 ** 6).reshape(1000, -1) + arr = np.arange(10**6).reshape(1000, -1) df = DataFrame(arr) df["C"] = 1.0 self.df = df @@ -100,6 +106,10 @@ def time_frame_op_with_series_axis0(self, opname): def time_frame_op_with_series_axis1(self, opname): getattr(operator, opname)(self.df, self.ser) + # exclude comparisons from the params for time_frame_op_with_series_axis1 + # since they do not do alignment so raise + time_frame_op_with_series_axis1.params = [params[0][6:]] + class FrameWithFrameWide: # Many-columns, mixed dtypes @@ -110,32 +120,40 @@ class FrameWithFrameWide: operator.add, operator.floordiv, operator.gt, - ] + ], + [ + # (n_rows, n_columns) + (1_000_000, 10), + (100_000, 100), + (10_000, 1000), + (1000, 10_000), + ], ] - param_names = ["op"] + param_names = ["op", "shape"] - def setup(self, op): + def setup(self, op, shape): # we choose dtypes so as to make the blocks # a) not perfectly match between right and left # b) appreciably bigger than single columns - n_cols = 2000 - n_rows = 500 + n_rows, n_cols = shape + + if op is operator.floordiv: + # floordiv is much slower than the other operations -> use less data + n_rows = n_rows // 10 # construct dataframe with 2 blocks - arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8") - arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4") - df = pd.concat( - [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True, - ) + arr1 = 
np.random.randn(n_rows, n_cols // 2).astype("f8") + arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4") + df = pd.concat([DataFrame(arr1), DataFrame(arr2)], axis=1, ignore_index=True) # should already be the case, but just to be sure df._consolidate_inplace() - # TODO: GH#33198 the setting here shoudlnt need two steps - arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8") - arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8") - arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8") + # TODO: GH#33198 the setting here shouldn't need two steps + arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8") + arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8") + arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8") df2 = pd.concat( - [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)], + [DataFrame(arr1), DataFrame(arr2), DataFrame(arr3)], axis=1, ignore_index=True, ) @@ -145,17 +163,16 @@ def setup(self, op): self.left = df self.right = df2 - def time_op_different_blocks(self, op): + def time_op_different_blocks(self, op, shape): # blocks (and dtypes) are not aligned op(self.left, self.right) - def time_op_same_blocks(self, op): + def time_op_same_blocks(self, op, shape): # blocks (and dtypes) are aligned op(self.left, self.left) class Ops: - params = [[True, False], ["default", 1]] param_names = ["use_numexpr", "threads"] @@ -187,7 +204,7 @@ def teardown(self, use_numexpr, threads): class Ops2: def setup(self): - N = 10 ** 3 + N = 10**3 self.df = DataFrame(np.random.randn(N, N)) self.df2 = DataFrame(np.random.randn(N, N)) @@ -239,12 +256,11 @@ def time_frame_series_dot(self): class Timeseries: - params = [None, "US/Eastern"] param_names = ["tz"] def setup(self, tz): - N = 10 ** 6 + N = 10**6 halfway = (N // 2) - 1 self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) self.ts = self.s[halfway] @@ -266,7 +282,7 @@ def time_timestamp_ops_diff_with_shift(self, tz): class IrregularOps: def setup(self): - N = 10 ** 5 + N = 10**5 idx = date_range(start="1/1/2000", periods=N, freq="s") s = Series(np.random.randn(N), index=idx) self.left = s.sample(frac=1) @@ -290,7 +306,7 @@ class CategoricalComparisons: param_names = ["op"] def setup(self, op): - N = 10 ** 5 + N = 10**5 self.cat = pd.Categorical(list("aabbcd") * N, ordered=True) def time_categorical_op(self, op): @@ -298,12 +314,11 @@ def time_categorical_op(self, op): class IndexArithmetic: - params = ["float", "int"] param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} self.index = getattr(tm, indexes[dtype])(N) @@ -329,7 +344,7 @@ class NumericInferOps: param_names = ["dtype"] def setup(self, dtype): - N = 5 * 10 ** 5 + N = 5 * 10**5 self.df = DataFrame( {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)} ) @@ -353,7 +368,7 @@ def time_modulo(self, dtype): class DateInferOps: # from GH 7332 def setup_cache(self): - N = 5 * 10 ** 5 + N = 5 * 10**5 df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")}) df["timedelta"] = df["datetime64"] - df["datetime64"] return df @@ -369,12 +384,11 @@ def time_add_timedeltas(self, df): class AddOverflowScalar: - params = [1, -1, 0] param_names = ["scalar"] def setup(self, scalar): - N = 10 ** 6 + N = 10**6 self.arr = np.arange(N) def time_add_overflow_scalar(self, scalar): @@ -383,7 +397,7 @@ def time_add_overflow_scalar(self, scalar): class AddOverflowArray: def setup(self): - N = 10 ** 6 + N = 10**6 self.arr = np.arange(N) self.arr_rev = 
np.arange(-N, 0) self.arr_mixed = np.array([1, -1]).repeat(N / 2) @@ -406,7 +420,7 @@ def time_add_overflow_both_arg_nan(self): hcal = pd.tseries.holiday.USFederalHolidayCalendar() -# These offsets currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ pd.offsets.Day(), pd.offsets.BYearEnd(), @@ -437,15 +451,14 @@ def time_add_overflow_both_arg_nan(self): class OffsetArrayArithmetic: - params = offsets param_names = ["offset"] def setup(self, offset): N = 10000 - rng = pd.date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="T") self.rng = rng - self.ser = pd.Series(rng) + self.ser = Series(rng) def time_add_series_offset(self, offset): with warnings.catch_warnings(record=True): @@ -462,7 +475,7 @@ class ApplyIndex: def setup(self, offset): N = 10000 - rng = pd.date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="T") self.rng = rng def time_apply_index(self, offset): @@ -474,17 +487,17 @@ class BinaryOpsMultiIndex: param_names = ["func"] def setup(self, func): - date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S") + array = date_range("20200101 00:00", "20200102 0:00", freq="S") level_0_names = [str(i) for i in range(30)] - index = pd.MultiIndex.from_product([level_0_names, date_range]) + index = pd.MultiIndex.from_product([level_0_names, array]) column_names = ["col_1", "col_2"] - self.df = pd.DataFrame( + self.df = DataFrame( np.random.rand(len(index), 2), index=index, columns=column_names ) - self.arg_df = pd.DataFrame( + self.arg_df = DataFrame( np.random.randint(1, 10, (len(level_0_names), 2)), index=level_0_names, columns=column_names, diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 103df0fd94847..ecd8c26ba6ca5 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,6 +2,8 @@ import pandas as pd +from .pandas_vb_common import tm + class BooleanArray: def setup(self): @@ -30,12 +32,111 @@ def time_from_float_array(self): class IntegerArray: def setup(self): - self.values_integer = np.array([1, 0, 1, 0]) - self.data = np.array([1, 2, 3, 4], dtype="int64") - self.mask = np.array([False, False, True, False]) + N = 250_000 + self.values_integer = np.array([1, 0, 1, 0] * N) + self.data = np.array([1, 2, 3, 4] * N, dtype="int64") + self.mask = np.array([False, False, True, False] * N) def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) def time_from_integer_array(self): pd.array(self.values_integer, dtype="Int64") + + +class IntervalArray: + def setup(self): + N = 10_000 + self.tuples = [(i, i + 1) for i in range(N)] + + def time_from_tuples(self): + pd.arrays.IntervalArray.from_tuples(self.tuples) + + +class StringArray: + def setup(self): + N = 100_000 + values = tm.rands_array(3, N) + self.values_obj = np.array(values, dtype="object") + self.values_str = np.array(values, dtype="U") + self.values_list = values.tolist() + + def time_from_np_object_array(self): + pd.array(self.values_obj, dtype="string") + + def time_from_np_str_array(self): + pd.array(self.values_str, dtype="string") + + def time_from_list(self): + pd.array(self.values_list, dtype="string") + + +class ArrowStringArray: + params = [False, True] + param_names = ["multiple_chunks"] + + def setup(self, multiple_chunks): + try: + import pyarrow as pa + except ImportError: + raise NotImplementedError + strings = tm.rands_array(3, 10_000) + 
if multiple_chunks: + chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)] + self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks)) + else: + self.array = pd.arrays.ArrowStringArray(pa.array(strings)) + + def time_setitem(self, multiple_chunks): + for i in range(200): + self.array[i] = "foo" + + def time_setitem_list(self, multiple_chunks): + indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) + self.array[indexer] = ["foo"] * len(indexer) + + def time_setitem_slice(self, multiple_chunks): + self.array[::10] = "foo" + + def time_setitem_null_slice(self, multiple_chunks): + self.array[:] = "foo" + + def time_tolist(self, multiple_chunks): + self.array.tolist() + + +class ArrowExtensionArray: + params = [ + [ + "boolean[pyarrow]", + "float64[pyarrow]", + "int64[pyarrow]", + "string[pyarrow]", + "timestamp[ns][pyarrow]", + ], + [False, True], + ] + param_names = ["dtype", "hasna"] + + def setup(self, dtype, hasna): + N = 100_000 + if dtype == "boolean[pyarrow]": + data = np.random.choice([True, False], N, replace=True) + elif dtype == "float64[pyarrow]": + data = np.random.randn(N) + elif dtype == "int64[pyarrow]": + data = np.arange(N) + elif dtype == "string[pyarrow]": + data = tm.rands_array(10, N) + elif dtype == "timestamp[ns][pyarrow]": + data = pd.date_range("2000-01-01", freq="s", periods=N) + else: + raise NotImplementedError + + arr = pd.array(data, dtype=dtype) + if hasna: + arr[::2] = pd.NA + self.arr = arr + + def time_to_numpy(self, dtype, hasna): + self.arr.to_numpy() diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 9c7b107b478d4..2a004113d1b91 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -3,11 +3,6 @@ import pandas as pd from pandas import DataFrame -try: - from pandas.util import cache_readonly -except ImportError: - from pandas.util.decorators import cache_readonly - try: from pandas.core.construction import extract_array except ImportError: @@ -20,14 +15,13 @@ def setup(self): self.cur_index = self.df.index def time_get_index(self): - self.foo = self.df.index + self.df.index def time_set_index(self): self.df.index = self.cur_index class SeriesArrayAttribute: - params = [["numeric", "object", "category", "datetime64", "datetime64tz"]] param_names = ["dtype"] @@ -53,17 +47,4 @@ def time_extract_array_numpy(self, dtype): extract_array(self.series, extract_numpy=True) -class CacheReadonly: - def setup(self): - class Foo: - @cache_readonly - def prop(self): - return 5 - - self.obj = Foo() - - def time_cache_readonly(self): - self.obj.prop - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a0b24342091ec..02747911d2226 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import string +import sys import warnings import numpy as np @@ -17,7 +19,7 @@ class Constructor: def setup(self): - N = 10 ** 5 + N = 10**5 self.categories = list("abcde") self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) @@ -67,28 +69,85 @@ def time_existing_series(self): pd.Categorical(self.series) +class AsType: + def setup(self): + N = 10**5 + + random_pick = np.random.default_rng().choice + + categories = { + "str": list(string.ascii_letters), + "int": np.random.randint(2**16, size=154), + "float": sys.maxsize * np.random.random((38,)), + "timestamp": [ + pd.Timestamp(x, unit="s") 
for x in np.random.randint(2**18, size=578) + ], + } + + self.df = pd.DataFrame( + {col: random_pick(cats, N) for col, cats in categories.items()} + ) + + for col in ("int", "float", "timestamp"): + self.df[col + "_as_str"] = self.df[col].astype(str) + + for col in self.df.columns: + self.df[col] = self.df[col].astype("category") + + def astype_str(self): + [self.df[col].astype("str") for col in "int float timestamp".split()] + + def astype_int(self): + [self.df[col].astype("int") for col in "int_as_str timestamp".split()] + + def astype_float(self): + [ + self.df[col].astype("float") + for col in "float_as_str int int_as_str timestamp".split() + ] + + def astype_datetime(self): + self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific")) + + class Concat: def setup(self): - N = 10 ** 5 + N = 10**5 self.s = pd.Series(list("aabbcd") * N).astype("category") self.a = pd.Categorical(list("aabbcd") * N) self.b = pd.Categorical(list("bbcdjk") * N) + self.idx_a = pd.CategoricalIndex(range(N), range(N)) + self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1)) + self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a) + self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b) + def time_concat(self): pd.concat([self.s, self.s]) def time_union(self): union_categoricals([self.a, self.b]) + def time_append_overlapping_index(self): + self.idx_a.append(self.idx_a) -class ValueCounts: + def time_append_non_overlapping_index(self): + self.idx_a.append(self.idx_b) + + def time_concat_overlapping_index(self): + pd.concat([self.df_a, self.df_a]) + def time_concat_non_overlapping_index(self): + pd.concat([self.df_a, self.df_b]) + + +class ValueCounts: params = [True, False] param_names = ["dropna"] def setup(self, dropna): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -106,7 +165,7 @@ def time_rendering(self): class SetCategories: def setup(self): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -116,7 +175,7 @@ def time_set_categories(self): class RemoveCategories: def setup(self): - n = 5 * 10 ** 5 + n = 5 * 10**5 arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") @@ -126,8 +185,8 @@ def time_remove_categories(self): class Rank: def setup(self): - N = 10 ** 5 - ncats = 100 + N = 10**5 + ncats = 15 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = pd.Series(self.s_str, dtype="category") @@ -160,25 +219,6 @@ def time_rank_int_cat_ordered(self): self.s_int_cat_ordered.rank() -class Isin: - - params = ["object", "int64"] - param_names = ["dtype"] - - def setup(self, dtype): - np.random.seed(1234) - n = 5 * 10 ** 5 - sample_size = 100 - arr = list(np.random.randint(0, n // 10, size=n)) - if dtype == "object": - arr = [f"s{i:04d}" for i in arr] - self.sample = np.random.choice(arr, sample_size) - self.series = pd.Series(arr).astype("category") - - def time_isin_categorical(self, dtype): - self.series.isin(self.sample) - - class IsMonotonic: def setup(self): N = 1000 @@ -200,7 +240,7 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains: def setup(self): - N = 10 ** 5 + N = 10**5 self.ci = tm.makeCategoricalIndex(N) self.c = self.ci.values self.key = self.ci.categories[0] @@ -213,12 +253,11 @@ def time_categorical_contains(self): class CategoricalSlicing: - params = 
["monotonic_incr", "monotonic_decr", "non_monotonic"] param_names = ["index"] def setup(self, index): - N = 10 ** 6 + N = 10**6 categories = ["a", "b", "c"] values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": @@ -254,7 +293,7 @@ def time_getitem_bool_array(self, index): class Indexing: def setup(self): - N = 10 ** 5 + N = 10**5 self.index = pd.CategoricalIndex(range(N), range(N)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] @@ -263,7 +302,7 @@ def time_get_loc(self): self.index.get_loc(self.category) def time_shallow_copy(self): - self.index._shallow_copy() + self.index._view() def time_align(self): pd.DataFrame({"a": self.series, "b": self.series[:500]}) @@ -286,7 +325,7 @@ def time_sort_values(self): class SearchSorted: def setup(self): - N = 10 ** 5 + N = 10**5 self.ci = tm.makeCategoricalIndex(N).sort_values() self.c = self.ci.values self.key = self.ci.categories[1] diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 7c43485f5ef45..2db00cc7f2ad9 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,13 @@ import numpy as np -from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp +from pandas import ( + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) from .pandas_vb_common import tm @@ -42,7 +49,6 @@ def list_of_lists_with_none(arr): class SeriesConstructors: - param_names = ["data_fmt", "with_index", "dtype"] params = [ [ @@ -70,7 +76,7 @@ def setup(self, data_fmt, with_index, dtype): raise NotImplementedError( "Series constructors do not support using generators with indexes" ) - N = 10 ** 4 + N = 10**4 if dtype == "float": arr = np.random.randn(N) else: @@ -84,7 +90,7 @@ def time_series_constructor(self, data_fmt, with_index, dtype): class SeriesDtypesConstructors: def setup(self): - N = 10 ** 4 + N = 10**4 self.arr = np.random.randn(N) self.arr_str = np.array(["foo", "bar", "baz"], dtype=object) self.s = Series( @@ -108,11 +114,34 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor: def setup(self): - N = 10 ** 4 + N = 10**4 self.iterables = [tm.makeStringIndex(N), range(20)] def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) +class DatetimeIndexConstructor: + def setup(self): + N = 20_000 + dti = date_range("1900-01-01", periods=N) + + self.list_of_timestamps = dti.tolist() + self.list_of_dates = dti.date.tolist() + self.list_of_datetimes = dti.to_pydatetime().tolist() + self.list_of_str = dti.strftime("%Y-%m-%d").tolist() + + def time_from_list_of_timestamps(self): + DatetimeIndex(self.list_of_timestamps) + + def time_from_list_of_dates(self): + DatetimeIndex(self.list_of_dates) + + def time_from_list_of_datetimes(self): + DatetimeIndex(self.list_of_datetimes) + + def time_from_list_of_str(self): + DatetimeIndex(self.list_of_str) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index bd17b710b108d..52c87455b12b3 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,11 +1,18 @@ +import string + import numpy as np -from pandas.api.types import pandas_dtype +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.api.types import ( + is_extension_array_dtype, + pandas_dtype, +) from .pandas_vb_common import ( datetime_dtypes, extension_dtypes, - lib, numeric_dtypes, string_dtypes, ) @@ -41,25 
+48,79 @@ def time_pandas_dtype_invalid(self, dtype): pass -class InferDtypes: +class SelectDtypes: + try: + params = [ + tm.ALL_INT_NUMPY_DTYPES + + tm.ALL_INT_EA_DTYPES + + tm.FLOAT_NUMPY_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] + except AttributeError: + params = [ + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] param_names = ["dtype"] - data_dict = { - "np-object": np.array([1] * 100000, dtype="O"), - "py-object": [1] * 100000, - "np-null": np.array([1] * 50000 + [np.nan] * 50000), - "py-null": [1] * 50000 + [None] * 50000, - "np-int": np.array([1] * 100000, dtype=int), - "np-floating": np.array([1.0] * 100000, dtype=float), - "empty": [], - "bytes": [b"a"] * 100000, - } - params = list(data_dict.keys()) - def time_infer_skipna(self, dtype): - lib.infer_dtype(self.data_dict[dtype], skipna=True) + def setup(self, dtype): + N, K = 5000, 50 + self.index = tm.makeStringIndex(N) + self.columns = tm.makeStringIndex(K) + + def create_df(data): + return DataFrame(data, index=self.index, columns=self.columns) + + self.df_int = create_df(np.random.randint(low=100, size=(N, K))) + self.df_float = create_df(np.random.randn(N, K)) + self.df_bool = create_df(np.random.choice([True, False], size=(N, K))) + self.df_string = create_df( + np.random.choice(list(string.ascii_letters), size=(N, K)) + ) + + def time_select_dtype_int_include(self, dtype): + self.df_int.select_dtypes(include=dtype) + + def time_select_dtype_int_exclude(self, dtype): + self.df_int.select_dtypes(exclude=dtype) + + def time_select_dtype_float_include(self, dtype): + self.df_float.select_dtypes(include=dtype) + + def time_select_dtype_float_exclude(self, dtype): + self.df_float.select_dtypes(exclude=dtype) + + def time_select_dtype_bool_include(self, dtype): + self.df_bool.select_dtypes(include=dtype) + + def time_select_dtype_bool_exclude(self, dtype): + self.df_bool.select_dtypes(exclude=dtype) + + def time_select_dtype_string_include(self, dtype): + self.df_string.select_dtypes(include=dtype) + + def time_select_dtype_string_exclude(self, dtype): + self.df_string.select_dtypes(exclude=dtype) + + +class CheckDtypes: + def setup(self): + self.ext_dtype = pd.Int64Dtype() + self.np_dtype = np.dtype("int64") + + def time_is_extension_array_dtype_true(self): + is_extension_array_dtype(self.ext_dtype) - def time_infer(self, dtype): - lib.infer_dtype(self.data_dict[dtype], skipna=False) + def time_is_extension_array_dtype_false(self): + is_extension_array_dtype(self.np_dtype) from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index cbab9fdc9c0ba..8a3d224c59a09 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -9,7 +9,6 @@ class Eval: - params = [["numexpr", "python"], [1, "all"]] param_names = ["engine", "threads"] @@ -43,7 +42,7 @@ def teardown(self, engine, threads): class Query: def setup(self): - N = 10 ** 6 + N = 10**6 halfway = (N // 2) - 1 index = pd.date_range("20010101", periods=N, freq="T") s = pd.Series(index) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dc6f45f810f3d..7092a679b8cf0 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,15 +1,30 @@ import numpy as np import pandas as pd -from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range 
+from pandas import ( + NA, + Categorical, + DataFrame, + Float64Dtype, + MultiIndex, + Series, + Timestamp, + date_range, +) from .pandas_vb_common import tm try: - from pandas.tseries.offsets import Nano, Hour + from pandas.tseries.offsets import ( + Hour, + Nano, + ) except ImportError: # For compatibility with older versions - from pandas.core.datetools import * # noqa + from pandas.core.datetools import ( + Hour, + Nano, + ) class FromDicts: @@ -22,6 +37,9 @@ def setup(self): self.dict_list = frame.to_dict(orient="records") self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} + # arrays which we won't consolidate + self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)} + def time_list_of_dict(self): DataFrame(self.dict_list) @@ -41,6 +59,10 @@ def time_nested_dict_int64(self): # nested dict, integer indexes, regression described in #621 DataFrame(self.data2) + def time_dict_of_categoricals(self): + # dict of arrays that we won't consolidate + DataFrame(self.dict_of_categoricals) + class FromSeries: def setup(self): @@ -52,13 +74,11 @@ def time_mi_series(self): class FromDictwithTimestamp: - params = [Nano(1), Hour(1)] param_names = ["offset"] def setup(self, offset): - N = 10 ** 3 - np.random.seed(1234) + N = 10**3 idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N) df = DataFrame(np.random.randn(N, 10), index=idx) self.d = df.to_dict() @@ -68,7 +88,6 @@ def time_dict_with_timestamp_offsets(self, offset): class FromRecords: - params = [None, 1000] param_names = ["nrows"] @@ -95,7 +114,6 @@ def time_frame_from_ndarray(self): class FromLists: - goal_time = 0.2 def setup(self): @@ -108,7 +126,6 @@ def time_frame_from_lists(self): class FromRange: - goal_time = 0.2 def setup(self): @@ -119,8 +136,28 @@ def time_frame_from_range(self): self.df = DataFrame(self.data) -class FromArrays: +class FromScalar: + def setup(self): + self.nrows = 100_000 + + def time_frame_from_scalar_ea_float64(self): + DataFrame( + 1.0, + index=range(self.nrows), + columns=list("abc"), + dtype=Float64Dtype(), + ) + + def time_frame_from_scalar_ea_float64_na(self): + DataFrame( + NA, + index=range(self.nrows), + columns=list("abc"), + dtype=Float64Dtype(), + ) + +class FromArrays: goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 44f71b392c0eb..761a6f7e9c9c1 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -3,7 +3,16 @@ import numpy as np -from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range +from pandas import ( + DataFrame, + MultiIndex, + NaT, + Series, + date_range, + isnull, + period_range, + timedelta_range, +) from .pandas_vb_common import tm @@ -19,31 +28,12 @@ def time_frame_get_numeric_data(self): self.df._get_numeric_data() -class Lookup: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh")) - self.df["foo"] = "bar" - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = list(self.df.columns) * 100 - self.row_labels_all = np.array( - list(self.df.index) * len(self.df.columns), dtype="object" - ) - self.col_labels_all = np.array( - list(self.df.columns) * len(self.df.index), dtype="object" - ) - - def time_frame_fancy_lookup(self): - self.df.lookup(self.row_labels, self.col_labels) - - def time_frame_fancy_lookup_all(self): - self.df.lookup(self.row_labels_all, self.col_labels_all) - - class Reindex: def setup(self): - N = 10 ** 3 + N = 
10**3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) + self.idx_cols = np.random.randint(0, N, N) self.df2 = DataFrame( { c: { @@ -60,10 +50,13 @@ def time_reindex_axis0(self): self.df.reindex(self.idx) def time_reindex_axis1(self): + self.df.reindex(columns=self.idx_cols) + + def time_reindex_axis1_missing(self): self.df.reindex(columns=self.idx) def time_reindex_both_axes(self): - self.df.reindex(index=self.idx, columns=self.idx) + self.df.reindex(index=self.idx, columns=self.idx_cols) def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) @@ -71,7 +64,7 @@ def time_reindex_upcast(self): class Rename: def setup(self): - N = 10 ** 3 + N = 10**3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.dict_idx = {k: k for k in self.idx} @@ -219,11 +212,87 @@ def time_to_html_mixed(self): self.df2.to_html() +class ToDict: + params = [["dict", "list", "series", "split", "records", "index"]] + param_names = ["orient"] + + def setup(self, orient): + data = np.random.randint(0, 1000, size=(10000, 4)) + self.int_df = DataFrame(data) + self.datetimelike_df = self.int_df.astype("timedelta64[ns]") + + def time_to_dict_ints(self, orient): + self.int_df.to_dict(orient=orient) + + def time_to_dict_datetimelike(self, orient): + self.datetimelike_df.to_dict(orient=orient) + + +class ToNumpy: + def setup(self): + N = 10000 + M = 10 + self.df_tall = DataFrame(np.random.randn(N, M)) + self.df_wide = DataFrame(np.random.randn(M, N)) + self.df_mixed_tall = self.df_tall.copy() + self.df_mixed_tall["foo"] = "bar" + self.df_mixed_tall[0] = period_range("2000", periods=N) + self.df_mixed_tall[1] = range(N) + self.df_mixed_wide = self.df_wide.copy() + self.df_mixed_wide["foo"] = "bar" + self.df_mixed_wide[0] = period_range("2000", periods=M) + self.df_mixed_wide[1] = range(M) + + def time_to_numpy_tall(self): + self.df_tall.to_numpy() + + def time_to_numpy_wide(self): + self.df_wide.to_numpy() + + def time_to_numpy_mixed_tall(self): + self.df_mixed_tall.to_numpy() + + def time_to_numpy_mixed_wide(self): + self.df_mixed_wide.to_numpy() + + def time_values_tall(self): + self.df_tall.values + + def time_values_wide(self): + self.df_wide.values + + def time_values_mixed_tall(self): + self.df_mixed_tall.values + + def time_values_mixed_wide(self): + self.df_mixed_wide.values + + +class ToRecords: + def setup(self): + N = 100_000 + data = np.random.randn(N, 2) + mi = MultiIndex.from_arrays( + [ + np.arange(N), + date_range("1970-01-01", periods=N, freq="ms"), + ] + ) + self.df = DataFrame(data) + self.df_mi = DataFrame(data, index=mi) + + def time_to_records(self): + self.df.to_records(index=True) + + def time_to_records_multiindex(self): + self.df_mi.to_records(index=True) + + class Repr: def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) - arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100) + arrays = np.tile(np.random.randn(3, nrows // 100), 100) idx = MultiIndex.from_arrays(arrays) self.df3 = DataFrame(data, index=idx) self.df4 = DataFrame(data, index=np.random.randn(nrows)) @@ -260,7 +329,7 @@ def time_frame_mask_floats(self): class Isnull: def setup(self): - N = 10 ** 3 + N = 10**3 self.df_no_null = DataFrame(np.random.randn(N, N)) sample = np.array([np.nan, 1.0]) @@ -302,21 +371,46 @@ def time_isnull_obj(self): class Fillna: - - params = ([True, False], ["pad", "bfill"]) - param_names = ["inplace", "method"] - - def setup(self, inplace, method): - values = np.random.randn(10000, 100) - 
values[::2] = np.nan - self.df = DataFrame(values) - - def time_frame_fillna(self, inplace, method): + params = ( + [True, False], + ["pad", "bfill"], + [ + "float64", + "float32", + "object", + "Int64", + "Float64", + "datetime64[ns]", + "datetime64[ns, tz]", + "timedelta64[ns]", + ], + ) + param_names = ["inplace", "method", "dtype"] + + def setup(self, inplace, method, dtype): + N, M = 10000, 100 + if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"): + data = { + "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N), + "datetime64[ns, tz]": date_range( + "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" + ), + "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"), + } + self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)}) + self.df[::2] = None + else: + values = np.random.randn(N, M) + values[::2] = np.nan + if dtype == "Int64": + values = values.round() + self.df = DataFrame(values, dtype=dtype) + + def time_frame_fillna(self, inplace, method, dtype): self.df.fillna(inplace=inplace, method=method) class Dropna: - params = (["all", "any"], [0, 1]) param_names = ["how", "axis"] @@ -336,7 +430,6 @@ def time_dropna_axis_mixed_dtypes(self, how, axis): class Count: - params = [0, 1] param_names = ["axis"] @@ -358,10 +451,10 @@ def setup(self, axis): ) def time_count_level_multi(self, axis): - self.df.count(axis=axis, level=1) + self.df.count(axis=axis) def time_count_level_mixed_dtypes_multi(self, axis): - self.df_mixed.count(axis=axis, level=1) + self.df_mixed.count(axis=axis) class Apply: @@ -401,7 +494,7 @@ def time_frame_dtypes(self): class Equals: def setup(self): - N = 10 ** 3 + N = 10**3 self.float_df = DataFrame(np.random.randn(N, N)) self.float_df_nan = self.float_df.copy() self.float_df_nan.iloc[-1, -1] = np.nan @@ -435,15 +528,18 @@ def time_frame_object_unequal(self): class Interpolate: - params = [None, "infer"] param_names = ["downcast"] def setup(self, downcast): N = 10000 # this is the worst case, where every column has NaNs. 
- self.df = DataFrame(np.random.randn(N, 100)) - self.df.values[::2] = np.nan + arr = np.random.randn(N, 100) + # NB: we need to set values in array, not in df.values, otherwise + # the benchmark will be misleading for ArrayManager + arr[::2] = np.nan + + self.df = DataFrame(arr) self.df2 = DataFrame( { @@ -483,6 +579,14 @@ def time_frame_nunique(self): self.df.nunique() +class SeriesNuniqueWithNan: + def setup(self): + self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + + def time_series_nunique_nan(self): + self.ser.nunique() + + class Duplicated: def setup(self): n = 1 << 20 @@ -503,14 +607,16 @@ def time_frame_duplicated(self): def time_frame_duplicated_wide(self): self.df2.duplicated() + def time_frame_duplicated_subset(self): + self.df.duplicated(subset=["a"]) -class XS: +class XS: params = [0, 1] param_names = ["axis"] def setup(self, axis): - self.N = 10 ** 4 + self.N = 10**4 self.df = DataFrame(np.random.randn(self.N, self.N)) def time_frame_xs(self, axis): @@ -518,7 +624,6 @@ def time_frame_xs(self, axis): class SortValues: - params = [True, False] param_names = ["ascending"] @@ -546,7 +651,6 @@ def time_frame_sort_values_by_columns(self): class Quantile: - params = [0, 1] param_names = ["axis"] @@ -557,6 +661,21 @@ def time_frame_quantile(self, axis): self.df.quantile([0.1, 0.5], axis=axis) +class Rank: + param_names = ["dtype"] + params = [ + ["int", "uint", "float", "object"], + ] + + def setup(self, dtype): + self.df = DataFrame( + np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype + ) + + def time_rank(self, dtype): + self.df.rank() + + class GetDtypeCounts: # 2807 def setup(self): @@ -571,7 +690,6 @@ def time_info(self): class NSort: - params = ["first", "last", "all"] param_names = ["keep"] @@ -595,9 +713,9 @@ class Describe: def setup(self): self.df = DataFrame( { - "a": np.random.randint(0, 100, int(1e6)), - "b": np.random.randint(0, 100, int(1e6)), - "c": np.random.randint(0, 100, int(1e6)), + "a": np.random.randint(0, 100, 10**6), + "b": np.random.randint(0, 100, 10**6), + "c": np.random.randint(0, 100, 10**6), } ) @@ -608,17 +726,6 @@ def time_dataframe_describe(self): self.df.describe() -class SelectDtypes: - params = [100, 1000] - param_names = ["n"] - - def setup(self, n): - self.df = DataFrame(np.random.randn(10, n)) - - def time_select_dtypes(self, n): - self.df.select_dtypes(include="int") - - class MemoryUsage: def setup(self): self.df = DataFrame(np.random.randn(100000, 2), columns=list("AB")) @@ -632,4 +739,22 @@ def time_memory_usage_object_dtype(self): self.df2.memory_usage(deep=True) +class Round: + def setup(self): + self.df = DataFrame(np.random.randn(10000, 10)) + self.df_t = self.df.transpose(copy=True) + + def time_round(self): + self.df.round() + + def time_round_transposed(self): + self.df_t.round() + + def peakmem_round(self): + self.df.round() + + def peakmem_round_transposed(self): + self.df_t.round() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index e266d871f5bc6..4d5c31d2dddf8 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,20 +1,29 @@ +from functools import wraps +import threading + import numpy as np -from pandas import DataFrame, Series, date_range, factorize, read_csv -from pandas.core.algorithms import take_1d +from pandas import ( + DataFrame, + Series, + date_range, + factorize, + read_csv, +) +from pandas.core.algorithms import take_nd from .pandas_vb_common import 
tm try: from pandas import ( - rolling_median, + rolling_kurt, + rolling_max, rolling_mean, + rolling_median, rolling_min, - rolling_max, - rolling_var, rolling_skew, - rolling_kurt, rolling_std, + rolling_var, ) have_rolling_methods = True @@ -24,33 +33,66 @@ from pandas._libs import algos except ImportError: from pandas import algos -try: - from pandas._testing import test_parallel - have_real_test_parallel = True -except ImportError: - have_real_test_parallel = False - def test_parallel(num_threads=1): - def wrapper(fname): - return fname +from .pandas_vb_common import BaseIO # isort:skip - return wrapper +def test_parallel(num_threads=2, kwargs_list=None): + """ + Decorator to run the same function multiple times in parallel. -from .pandas_vb_common import BaseIO # isort:skip + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + Notes + ----- + This decorator does not pass the return value of the decorated function. -class ParallelGroupbyMethods: + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + return wrapper + + +class ParallelGroupbyMethods: params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"]) param_names = ["threads", "method"] def setup(self, threads, method): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 - ngroups = 10 ** 3 + N = 10**6 + ngroups = 10**3 df = DataFrame( {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} ) @@ -75,15 +117,12 @@ def time_loop(self, threads, method): class ParallelGroups: - params = [2, 4, 8] param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError - size = 2 ** 22 - ngroups = 10 ** 3 + size = 2**22 + ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) @test_parallel(num_threads=threads) @@ -97,20 +136,17 @@ def time_get_groups(self, threads): class ParallelTake1D: - params = ["int64", "float64"] param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 + N = 10**6 df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @test_parallel(num_threads=2) def parallel_take1d(): - take_1d(df["col"].values, indexer) + take_nd(df["col"].values, indexer) self.parallel_take1d = parallel_take1d @@ -119,15 +155,14 @@ def time_take1d(self, dtype): class ParallelKth: + # This depends exclusively on code in _libs/, could go in libs.py number = 1 repeat = 5 def setup(self): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 7 - k = 5 * 10 ** 5 + N = 10**7 + k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] 
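# Minimal usage sketch for the ``test_parallel`` decorator defined above (names
# below are hypothetical; assumes the decorator is in scope as in this module).
# The decorated call starts ``num_threads`` threads, each running the original
# function, optionally with per-thread kwargs taken from ``kwargs_list``.
import numpy as np

@test_parallel(num_threads=2, kwargs_list=[{"arr": np.arange(5)}, {"arr": np.arange(5, 10)}])
def parallel_sum(arr=None):
    arr.sum()

parallel_sum()  # starts and joins two threads; return values are discarded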
@test_parallel(num_threads=2, kwargs_list=kwargs_list) @@ -142,9 +177,7 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): - if not have_real_test_parallel: - raise NotImplementedError - N = 10 ** 6 + N = 10**6 self.dti = date_range("1900-01-01", periods=N, freq="T") self.period = self.dti.to_period("D") @@ -192,13 +225,10 @@ def run(period): class ParallelRolling: - params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"] param_names = ["method"] def setup(self, method): - if not have_real_test_parallel: - raise NotImplementedError win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, "rolling"): @@ -234,15 +264,12 @@ def time_rolling(self, method): class ParallelReadCSV(BaseIO): - number = 1 repeat = 5 params = ["float", "object", "datetime"] param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError rows = 10000 cols = 50 data = { @@ -270,16 +297,12 @@ def time_read_csv(self, dtype): class ParallelFactorize: - number = 1 repeat = 5 params = [2, 4, 8] param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError - strings = tm.makeStringIndex(100000) @test_parallel(num_threads=threads) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5ffda03fad80f..4c0f3ddd826b7 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,19 +5,23 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, + Index, MultiIndex, Series, Timestamp, date_range, period_range, + to_timedelta, ) from .pandas_vb_common import tm method_blocklist = { "object": { + "diff", "median", "prod", "sem", @@ -29,11 +33,9 @@ "skew", "cumprod", "cummax", - "rank", "pct_change", "min", "var", - "mad", "describe", "std", "quantile", @@ -50,7 +52,6 @@ "cummax", "pct_change", "var", - "mad", "describe", "std", }, @@ -69,9 +70,17 @@ def time_groupby_apply_dict_return(self): class Apply: - def setup_cache(self): - N = 10 ** 4 - labels = np.random.randint(0, 2000, size=N) + param_names = ["factor"] + params = [4, 5] + + def setup(self, factor): + N = 10**factor + # two cases: + # - small groups: small data (N**4) + many labels (2000) -> average group + # size of 5 (-> larger overhead of slicing method) + # - larger groups: larger data (N**5) + fewer labels (20) -> average group + # size of 5000 + labels = np.random.randint(0, 2000 if factor == 4 else 20, size=N) labels2 = np.random.randint(0, 3, size=N) df = DataFrame( { @@ -81,13 +90,13 @@ def setup_cache(self): "value2": ["foo", "bar", "baz", "qux"] * (N // 4), } ) - return df + self.df = df - def time_scalar_function_multi_col(self, df): - df.groupby(["key", "key2"]).apply(lambda x: 1) + def time_scalar_function_multi_col(self, factor): + self.df.groupby(["key", "key2"]).apply(lambda x: 1) - def time_scalar_function_single_col(self, df): - df.groupby("key").apply(lambda x: 1) + def time_scalar_function_single_col(self, factor): + self.df.groupby("key").apply(lambda x: 1) @staticmethod def df_copy_function(g): @@ -95,20 +104,31 @@ def df_copy_function(g): g.name return g.copy() - def time_copy_function_multi_col(self, df): - df.groupby(["key", "key2"]).apply(self.df_copy_function) + def time_copy_function_multi_col(self, factor): + self.df.groupby(["key", "key2"]).apply(self.df_copy_function) - def time_copy_overhead_single_col(self, df): - df.groupby("key").apply(self.df_copy_function) + def time_copy_overhead_single_col(self, factor): + 
self.df.groupby("key").apply(self.df_copy_function) -class Groups: +class ApplyNonUniqueUnsortedIndex: + def setup(self): + # GH 46527 + # unsorted and non-unique index + idx = np.arange(100)[::-1] + idx = Index(np.repeat(idx, 200), name="key") + self.df = DataFrame(np.random.randn(len(idx), 10), index=idx) + def time_groupby_apply_non_unique_unsorted_index(self): + self.df.groupby("key", group_keys=False).apply(lambda x: x) + + +class Groups: param_names = ["key"] params = ["int64_small", "int64_large", "object_small", "object_large"] def setup_cache(self): - size = 10 ** 6 + size = 10**6 data = { "int64_small": Series(np.random.randint(0, 100, size=size)), "int64_large": Series(np.random.randint(0, 10000, size=size)), @@ -127,9 +147,11 @@ def setup(self, data, key): def time_series_groups(self, data, key): self.ser.groupby(self.ser).groups + def time_series_indices(self, data, key): + self.ser.groupby(self.ser).indices -class GroupManyLabels: +class GroupManyLabels: params = [1, 1000] param_names = ["ncols"] @@ -144,12 +166,11 @@ def time_sum(self, ncols): class Nth: - param_names = ["dtype"] params = ["float32", "float64", "datetime", "object"] def setup(self, dtype): - N = 10 ** 5 + N = 10**5 # with datetimes (GH7555) if dtype == "datetime": values = date_range("1/1/2011", periods=N, freq="s") @@ -257,7 +278,7 @@ def time_multi_int_nunique(self, df): class AggFunctions: def setup_cache(self): - N = 10 ** 5 + N = 10**5 fac1 = np.array(["A", "B", "C"], dtype="O") fac2 = np.array(["one", "two"], dtype="O") df = DataFrame( @@ -285,12 +306,12 @@ def time_different_python_functions_multicol(self, df): df.groupby(["key1", "key2"]).agg([sum, min, max]) def time_different_python_functions_singlecol(self, df): - df.groupby("key1").agg([sum, min, max]) + df.groupby("key1")[["value1", "value2", "value3"]].agg([sum, min, max]) class GroupStrings: def setup(self): - n = 2 * 10 ** 5 + n = 2 * 10**5 alpha = list(map("".join, product(ascii_letters, repeat=4))) data = np.random.choice(alpha, (n // 5, 4), replace=False) data = np.repeat(data, 5, axis=0) @@ -304,7 +325,7 @@ def time_multi_columns(self): class MultiColumn: def setup_cache(self): - N = 10 ** 5 + N = 10**5 key1 = np.tile(np.arange(100, dtype=object), 1000) key2 = key1.copy() np.random.shuffle(key1) @@ -334,7 +355,7 @@ def time_col_select_numpy_sum(self, df): class Size: def setup(self): - n = 10 ** 5 + n = 10**5 offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") dates = np.datetime64("now") + offsets self.df = DataFrame( @@ -358,11 +379,42 @@ def time_category_size(self): self.draws.groupby(self.cats).size() -class GroupByMethods: +class Shift: + def setup(self): + N = 18 + self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))}) + + def time_defaults(self): + self.df.groupby("g").shift() + + def time_fill_value(self): + self.df.groupby("g").shift(fill_value=99) + + +class FillNA: + def setup(self): + N = 100 + self.df = DataFrame( + {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N} + ).set_index("group") + + def time_df_ffill(self): + self.df.groupby("group").fillna(method="ffill") + + def time_df_bfill(self): + self.df.groupby("group").fillna(method="bfill") - param_names = ["dtype", "method", "application"] + def time_srs_ffill(self): + self.df.groupby("group")["value"].fillna(method="ffill") + + def time_srs_bfill(self): + self.df.groupby("group")["value"].fillna(method="bfill") + + +class GroupByMethods: + param_names = ["dtype", "method", "application", "ncols"] params = [ - ["int", "float", "object", 
"datetime"], + ["int", "int16", "float", "object", "datetime", "uint"], [ "all", "any", @@ -374,11 +426,11 @@ class GroupByMethods: "cumprod", "cumsum", "describe", + "diff", "ffill", "first", "head", "last", - "mad", "max", "min", "median", @@ -400,17 +452,43 @@ class GroupByMethods: "var", ], ["direct", "transformation"], + [1, 5], ] - def setup(self, dtype, method, application): + def setup(self, dtype, method, application, ncols): if method in method_blocklist.get(dtype, {}): raise NotImplementedError # skip benchmark - ngroups = 1000 + + if ncols != 1 and method in ["value_counts", "unique"]: + # DataFrameGroupBy doesn't have these methods + raise NotImplementedError + + if application == "transformation" and method in [ + "describe", + "head", + "tail", + "unique", + "value_counts", + "size", + ]: + # DataFrameGroupBy doesn't have these methods + raise NotImplementedError + + if method == "describe": + ngroups = 20 + elif method == "skew": + ngroups = 100 + else: + ngroups = 1000 size = ngroups * 2 - rng = np.arange(ngroups) - values = rng.take(np.random.randint(0, ngroups, size=size)) + rng = np.arange(ngroups).reshape(-1, 1) + rng = np.broadcast_to(rng, (len(rng), ncols)) + taker = np.random.randint(0, ngroups, size=size) + values = rng.take(taker, axis=0) if dtype == "int": key = np.random.randint(0, size, size=size) + elif dtype in ("int16", "uint"): + key = np.random.randint(0, size, size=size, dtype=dtype) elif dtype == "float": key = np.concatenate( [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0] @@ -420,25 +498,134 @@ def setup(self, dtype, method, application): elif dtype == "datetime": key = date_range("1/1/2011", periods=size, freq="s") - df = DataFrame({"values": values, "key": key}) + cols = [f"values{n}" for n in range(ncols)] + df = DataFrame(values, columns=cols) + df["key"] = key - if application == "transform": - if method == "describe": - raise NotImplementedError + if len(cols) == 1: + cols = cols[0] - self.as_group_method = lambda: df.groupby("key")["values"].transform(method) - self.as_field_method = lambda: df.groupby("values")["key"].transform(method) + if application == "transformation": + self.as_group_method = lambda: df.groupby("key")[cols].transform(method) + self.as_field_method = lambda: df.groupby(cols)["key"].transform(method) else: - self.as_group_method = getattr(df.groupby("key")["values"], method) - self.as_field_method = getattr(df.groupby("values")["key"], method) + self.as_group_method = getattr(df.groupby("key")[cols], method) + self.as_field_method = getattr(df.groupby(cols)["key"], method) - def time_dtype_as_group(self, dtype, method, application): + def time_dtype_as_group(self, dtype, method, application, ncols): self.as_group_method() - def time_dtype_as_field(self, dtype, method, application): + def time_dtype_as_field(self, dtype, method, application, ncols): self.as_field_method() +class GroupByCythonAgg: + """ + Benchmarks specifically targeting our cython aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). 
+ """ + + param_names = ["dtype", "method"] + params = [ + ["float64"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + N = 1_000_000 + df = DataFrame(np.random.randn(N, 10), columns=list("abcdefghij")) + df["key"] = np.random.randint(0, 100, size=N) + self.df = df + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method) + + +class GroupByCythonAggEaDtypes: + """ + Benchmarks specifically targeting our cython aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). + """ + + param_names = ["dtype", "method"] + params = [ + ["Float64", "Int64", "Int32"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + N = 1_000_000 + df = DataFrame( + np.random.randint(0, high=100, size=(N, 10)), + columns=list("abcdefghij"), + dtype=dtype, + ) + df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA + df["key"] = np.random.randint(0, 100, size=N) + self.df = df + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method) + + +class Cumulative: + param_names = ["dtype", "method", "with_nans"] + params = [ + ["float64", "int64", "Float64", "Int64"], + ["cummin", "cummax", "cumsum"], + [True, False], + ] + + def setup(self, dtype, method, with_nans): + if with_nans and dtype == "int64": + raise NotImplementedError("Construction of df would raise") + + N = 500_000 + keys = np.random.randint(0, 100, size=N) + vals = np.random.randint(-10, 10, (N, 5)) + + if with_nans: + null_vals = vals.astype(float, copy=True) + null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan + df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + df["key"] = keys + self.df = df + else: + df = DataFrame(vals, columns=list("abcde")).astype(dtype, copy=False) + df["key"] = keys + self.df = df + + def time_frame_transform(self, dtype, method, with_nans): + self.df.groupby("key").transform(method) + + class RankWithTies: # GH 21237 param_names = ["dtype", "tie_method"] @@ -448,7 +635,7 @@ class RankWithTies: ] def setup(self, dtype, tie_method): - N = 10 ** 4 + N = 10**4 if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: @@ -466,15 +653,43 @@ def setup(self): tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) tmp = np.concatenate((tmp1, tmp2)) arr = np.repeat(tmp, 10) - self.df = DataFrame(dict(a=arr, b=arr)) + self.df = DataFrame({"a": arr, "b": arr}) def time_sum(self): self.df.groupby(["a"])["b"].sum() +class String: + # GH#41596 + param_names = ["dtype", "method"] + params = [ + ["str", "string[python]"], + [ + "sum", + "min", + "max", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + cols = list("abcdefghjkl") + self.df = DataFrame( + np.random.randint(0, 100, size=(10_000, len(cols))), + columns=cols, + dtype=dtype, + ) + + def time_str_func(self, dtype, method): + self.df.groupby("a")[self.df.columns[1:]].agg(method) + + class Categories: def setup(self): - N = 10 ** 5 + N = 10**5 arr = np.random.random(N) data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr} self.df = DataFrame(data) @@ -516,14 +731,14 @@ class Datelike: param_names = ["grouper"] def setup(self, grouper): - N = 10 ** 4 + N = 10**4 rng_map = { "period_range": period_range, "date_range": date_range, 
"date_range_tz": partial(date_range, tz="US/Central"), } self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N) - self.df = DataFrame(np.random.randn(10 ** 4, 2)) + self.df = DataFrame(np.random.randn(10**4, 2)) def time_sum(self, grouper): self.df.groupby(self.grouper).sum() @@ -569,6 +784,18 @@ def setup(self): data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"]) self.df = data + n = 1000 + self.df_wide = DataFrame( + np.random.randn(n, n), + index=np.random.choice(range(10), n), + ) + + n = 1_000_000 + self.df_tall = DataFrame( + np.random.randn(n, 3), + index=np.random.randint(0, 5, n), + ) + n = 20000 self.df1 = DataFrame( np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"] @@ -588,6 +815,12 @@ def time_transform_lambda_max(self): def time_transform_ufunc_max(self): self.df.groupby(level="lev1").transform(np.max) + def time_transform_lambda_max_tall(self): + self.df_tall.groupby(level=0).transform(lambda x: np.max(x, axis=0)) + + def time_transform_lambda_max_wide(self): + self.df_wide.groupby(level=0).transform(lambda x: np.max(x, axis=0)) + def time_transform_multi_key1(self): self.df1.groupby(["jim", "joe"])["jolie"].transform("max") @@ -605,7 +838,7 @@ class TransformBools: def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) - transitions = np.zeros(N, dtype=np.bool) + transitions = np.zeros(N, dtype=np.bool_) transitions[transition_points] = True self.g = transitions.cumsum() self.df = DataFrame({"signal": np.random.rand(N)}) @@ -627,33 +860,41 @@ def time_first(self): class TransformEngine: - def setup(self): - N = 10 ** 3 + param_names = ["parallel"] + params = [[True, False]] + + def setup(self, parallel): + N = 10**3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], ) + self.parallel = parallel self.grouper = data.groupby(0) - def time_series_numba(self): + def time_series_numba(self, parallel): def function(values, index): return values * 5 - self.grouper[1].transform(function, engine="numba") + self.grouper[1].transform( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_series_cython(self): + def time_series_cython(self, parallel): def function(values): return values * 5 self.grouper[1].transform(function, engine="cython") - def time_dataframe_numba(self): + def time_dataframe_numba(self, parallel): def function(values, index): return values * 5 - self.grouper.transform(function, engine="numba") + self.grouper.transform( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_dataframe_cython(self): + def time_dataframe_cython(self, parallel): def function(values): return values * 5 @@ -661,15 +902,19 @@ def function(values): class AggEngine: - def setup(self): - N = 10 ** 3 + param_names = ["parallel"] + params = [[True, False]] + + def setup(self, parallel): + N = 10**3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], ) + self.parallel = parallel self.grouper = data.groupby(0) - def time_series_numba(self): + def time_series_numba(self, parallel): def function(values, index): total = 0 for i, value in enumerate(values): @@ -679,9 +924,11 @@ def function(values, index): total += value * 2 return total - self.grouper[1].agg(function, engine="numba") + self.grouper[1].agg( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_series_cython(self): + def time_series_cython(self, parallel): def 
function(values): total = 0 for i, value in enumerate(values): @@ -693,7 +940,7 @@ def function(values): self.grouper[1].agg(function, engine="cython") - def time_dataframe_numba(self): + def time_dataframe_numba(self, parallel): def function(values, index): total = 0 for i, value in enumerate(values): @@ -703,9 +950,11 @@ def function(values, index): total += value * 2 return total - self.grouper.agg(function, engine="numba") + self.grouper.agg( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_dataframe_cython(self): + def time_dataframe_cython(self, parallel): def function(values): total = 0 for i, value in enumerate(values): @@ -718,4 +967,45 @@ def function(values): self.grouper.agg(function, engine="cython") +class Sample: + def setup(self): + N = 10**3 + self.df = DataFrame({"a": np.zeros(N)}) + self.groups = np.arange(0, N) + self.weights = np.ones(N) + + def time_sample(self): + self.df.groupby(self.groups).sample(n=1) + + def time_sample_weights(self): + self.df.groupby(self.groups).sample(n=1, weights=self.weights) + + +class Resample: + # GH 28635 + def setup(self): + num_timedeltas = 20_000 + num_groups = 3 + + index = MultiIndex.from_product( + [ + np.arange(num_groups), + to_timedelta(np.arange(num_timedeltas), unit="s"), + ], + names=["groups", "timedeltas"], + ) + data = np.random.randint(0, 1000, size=(len(index))) + + self.df = DataFrame(data, index=index).reset_index("timedeltas") + self.df_multiindex = DataFrame(data, index=index) + + def time_resample(self): + self.df.groupby(level="groups").resample("10s", on="timedeltas").mean() + + def time_resample_multiindex(self): + self.df_multiindex.groupby(level="groups").resample( + "10s", level="timedeltas" + ).mean() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py new file mode 100644 index 0000000000000..d2c5b4dfbef70 --- /dev/null +++ b/asv_bench/benchmarks/hash_functions.py @@ -0,0 +1,89 @@ +import numpy as np + +import pandas as pd + + +class UniqueForLargePyObjectInts: + def setup(self): + lst = [x << 32 for x in range(5000)] + self.arr = np.array(lst, dtype=np.object_) + + def time_unique(self): + pd.unique(self.arr) + + +class Float64GroupIndex: + # GH28303 + def setup(self): + self.df = pd.date_range( + start="1/1/2018", end="1/2/2018", periods=10**6 + ).to_frame() + self.group_index = np.round(self.df.index.astype(int) / 10**9) + + def time_groupby(self): + self.df.groupby(self.group_index).last() + + +class UniqueAndFactorizeArange: + params = range(4, 16) + param_names = ["exponent"] + + def setup(self, exponent): + a = np.arange(10**4, dtype="float64") + self.a2 = (a + 10**exponent).repeat(100) + + def time_factorize(self, exponent): + pd.factorize(self.a2) + + def time_unique(self, exponent): + pd.unique(self.a2) + + +class Unique: + params = ["Int64", "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype) + self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype) + + def time_unique_with_duplicates(self, exponent): + pd.unique(self.ser) + + def time_unique(self, exponent): + pd.unique(self.ser_unique) + + +class NumericSeriesIndexing: + params = [ + (np.int64, np.uint64, np.float64), + (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), + ] + param_names = ["dtype", "N"] + + def setup(self, dtype, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)), 
dtype=dtype) + indices = pd.Index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] + + +class NumericSeriesIndexingShuffled: + params = [ + (np.int64, np.uint64, np.float64), + (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), + ] + param_names = ["dtype", "N"] + + def setup(self, dtype, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype) + np.random.shuffle(vals) + indices = pd.Index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py index 16fbc741775e4..b3d8de39a858a 100644 --- a/asv_bench/benchmarks/index_cached_properties.py +++ b/asv_bench/benchmarks/index_cached_properties.py @@ -22,7 +22,7 @@ class IndexCache: param_names = ["index_type"] def setup(self, index_type): - N = 10 ** 5 + N = 10**5 if index_type == "MultiIndex": self.idx = pd.MultiIndex.from_product( [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]] @@ -30,7 +30,7 @@ def setup(self, index_type): elif index_type == "DatetimeIndex": self.idx = pd.date_range("1/1/2000", freq="T", periods=N) elif index_type == "Int64Index": - self.idx = pd.Index(range(N)) + self.idx = pd.Index(range(N), dtype="int64") elif index_type == "PeriodIndex": self.idx = pd.period_range("1/1/2000", freq="T", periods=N) elif index_type == "RangeIndex": @@ -40,9 +40,9 @@ def setup(self, index_type): elif index_type == "TimedeltaIndex": self.idx = pd.TimedeltaIndex(range(N)) elif index_type == "Float64Index": - self.idx = pd.Float64Index(range(N)) + self.idx = pd.Index(range(N), dtype="float64") elif index_type == "UInt64Index": - self.idx = pd.UInt64Index(range(N)) + self.idx = pd.Index(range(N), dtype="uint64") elif index_type == "CategoricalIndex": self.idx = pd.CategoricalIndex(range(N), range(N)) else: @@ -56,9 +56,6 @@ def time_values(self, index_type): def time_shape(self, index_type): self.idx.shape - def time_is_monotonic(self, index_type): - self.idx.is_monotonic - def time_is_monotonic_decreasing(self, index_type): self.idx.is_monotonic_decreasing @@ -73,6 +70,3 @@ def time_engine(self, index_type): def time_inferred_type(self, index_type): self.idx.inferred_type - - def time_is_all_dates(self, index_type): - self.idx.is_all_dates diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index b242de6a17208..bdc8a6a7aa1df 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -4,7 +4,6 @@ from pandas import ( DatetimeIndex, - Float64Index, Index, IntervalIndex, MultiIndex, @@ -17,36 +16,45 @@ class SetOperations: - params = ( - ["datetime", "date_string", "int", "strings"], + ["monotonic", "non_monotonic"], + ["datetime", "date_string", "int", "strings", "ea_int"], ["intersection", "union", "symmetric_difference"], ) - param_names = ["dtype", "method"] + param_names = ["index_structure", "dtype", "method"] - def setup(self, dtype, method): - N = 10 ** 5 + def setup(self, index_structure, dtype, method): + N = 10**5 dates_left = date_range("1/1/2000", periods=N, freq="T") fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) + ea_int_left = Index(np.arange(N), dtype="Int64") str_left = tm.makeStringIndex(N) + data = { - "datetime": {"left": dates_left, "right": 
dates_left[:-1]}, - "date_string": {"left": date_str_left, "right": date_str_left[:-1]}, - "int": {"left": int_left, "right": int_left[:-1]}, - "strings": {"left": str_left, "right": str_left[:-1]}, + "datetime": dates_left, + "date_string": date_str_left, + "int": int_left, + "strings": str_left, + "ea_int": ea_int_left, } + + if index_structure == "non_monotonic": + data = {k: mi[::-1] for k, mi in data.items()} + + data = {k: {"left": idx, "right": idx[:-1]} for k, idx in data.items()} + self.left = data[dtype]["left"] self.right = data[dtype]["right"] - def time_operation(self, dtype, method): + def time_operation(self, index_structure, dtype, method): getattr(self.left, method)(self.right) class SetDisjoint: def setup(self): - N = 10 ** 5 + N = 10**5 B = N + 20000 self.datetime_left = DatetimeIndex(range(N)) self.datetime_right = DatetimeIndex(range(N, B)) @@ -55,10 +63,19 @@ def time_datetime_difference_disjoint(self): self.datetime_left.difference(self.datetime_right) +class UnionWithDuplicates: + def setup(self): + self.left = Index(np.repeat(np.arange(1000), 100)) + self.right = Index(np.tile(np.arange(500, 1500), 50)) + + def time_union_with_duplicates(self): + self.left.union(self.right) + + class Range: def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) - self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10**6, step=3) + self.idx_dec = RangeIndex(start=10**6, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -73,15 +90,29 @@ def time_min_trivial(self): self.idx_inc.min() def time_get_loc_inc(self): - self.idx_inc.get_loc(900000) + self.idx_inc.get_loc(900_000) def time_get_loc_dec(self): - self.idx_dec.get_loc(100000) + self.idx_dec.get_loc(100_000) + + def time_iter_inc(self): + for _ in self.idx_inc: + pass + + def time_iter_dec(self): + for _ in self.idx_dec: + pass + + def time_sort_values_asc(self): + self.idx_inc.sort_values() + + def time_sort_values_des(self): + self.idx_inc.sort_values(ascending=False) class IndexEquals: def setup(self): - idx_large_fast = RangeIndex(100000) + idx_large_fast = RangeIndex(100_000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) @@ -93,8 +124,7 @@ def time_non_object_equals_multiindex(self): class IndexAppend: def setup(self): - - N = 10000 + N = 10_000 self.range_idx = RangeIndex(0, 100) self.int_idx = self.range_idx.astype(int) self.obj_idx = self.int_idx.astype(str) @@ -120,12 +150,11 @@ def time_append_obj_list(self): class Indexing: - params = ["String", "Float", "Int"] param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 self.idx = getattr(tm, f"make{dtype}Index")(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) @@ -168,9 +197,9 @@ def time_get_loc_non_unique_sorted(self, dtype): class Float64IndexMethod: # GH 13166 def setup(self): - N = 100000 - a = np.arange(N) - self.ind = Float64Index(a * 4.8000000418824129e-08) + N = 100_000 + a = np.arange(N, dtype=np.float64) + self.ind = Index(a * 4.8000000418824129e-08) def time_get_loc(self): self.ind.get_loc(0) @@ -178,7 +207,7 @@ def time_get_loc(self): class IntervalIndexMethod: # GH 24813 - params = [10 ** 3, 10 ** 5] + params = [10**3, 10**5] def setup(self, N): left = np.append(np.arange(N), np.array(0)) @@ -212,7 +241,7 @@ class GC: params = [1, 2, 5] def create_use_drop(self): - idx = Index(list(range(1000 * 1000))) + idx = 
Index(list(range(1_000_000))) idx._engine def peakmem_gc_instances(self, N): diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 836d3ca8602ec..53827cfcf64fb 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -8,15 +8,13 @@ import numpy as np from pandas import ( + NA, CategoricalIndex, DataFrame, - Float64Index, - IndexSlice, - Int64Index, + Index, IntervalIndex, MultiIndex, Series, - UInt64Index, concat, date_range, option_context, @@ -27,19 +25,18 @@ class NumericSeriesIndexing: - params = [ - (Int64Index, UInt64Index, Float64Index), + (np.int64, np.uint64, np.float64), ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ["index_dtype", "index_structure"] + param_names = ["dtype", "index_structure"] - def setup(self, index, index_structure): - N = 10 ** 6 + def setup(self, dtype, index_structure): + N = 10**6 indices = { - "unique_monotonic_inc": index(range(N)), - "nonunique_monotonic_inc": index( - list(range(55)) + [54] + list(range(55, N - 1)) + "unique_monotonic_inc": Index(range(N), dtype=dtype), + "nonunique_monotonic_inc": Index( + list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype ), } self.data = Series(np.random.rand(N), index=indices[index_structure]) @@ -86,8 +83,37 @@ def time_loc_slice(self, index, index_structure): self.data.loc[:800000] -class NonNumericSeriesIndexing: +class NumericMaskedIndexing: + monotonic_list = list(range(10**6)) + non_monotonic_list = ( + list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1)) + ) + + params = [ + ("Int64", "UInt64", "Float64"), + (True, False), + ] + param_names = ["dtype", "monotonic"] + + def setup(self, dtype, monotonic): + indices = { + True: Index(self.monotonic_list, dtype=dtype), + False: Index(self.non_monotonic_list, dtype=dtype).append( + Index([NA], dtype=dtype) + ), + } + self.data = indices[monotonic] + self.indexer = np.arange(300, 1_000) + self.data_dups = self.data.append(self.data) + + def time_get_indexer(self, dtype, monotonic): + self.data.get_indexer(self.indexer) + + def time_get_indexer_dups(self, dtype, monotonic): + self.data.get_indexer_for(self.indexer) + +class NonNumericSeriesIndexing: params = [ ("string", "datetime", "period"), ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"), @@ -95,7 +121,7 @@ class NonNumericSeriesIndexing: param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10 ** 6 + N = 10**6 if index == "string": index = tm.makeStringIndex(N) elif index == "datetime": @@ -142,6 +168,12 @@ def setup(self): def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] + def time_at(self): + self.df.at[self.idx_scalar, self.col_scalar] + + def time_at_setitem(self): + self.df.at[self.idx_scalar, self.col_scalar] = 0.0 + def time_getitem_scalar(self): self.df[self.col_scalar][self.idx_scalar] @@ -156,70 +188,134 @@ def time_boolean_rows_boolean(self): class DataFrameNumericIndexing: - def setup(self): + params = [ + (np.int64, np.uint64, np.float64), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), + ] + param_names = ["dtype", "index_structure"] + + def setup(self, dtype, index_structure): + N = 10**5 + indices = { + "unique_monotonic_inc": Index(range(N), dtype=dtype), + "nonunique_monotonic_inc": Index( + list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype + ), + } self.idx_dupe = np.array(range(30)) * 99 - self.df = DataFrame(np.random.randn(100000, 5)) + self.df = DataFrame(np.random.randn(N, 5), 
index=indices[index_structure]) self.df_dup = concat([self.df, 2 * self.df, 3 * self.df]) - self.bool_indexer = [True] * 50000 + [False] * 50000 + self.bool_indexer = [True] * (N // 2) + [False] * (N - N // 2) - def time_iloc_dups(self): + def time_iloc_dups(self, index, index_structure): self.df_dup.iloc[self.idx_dupe] - def time_loc_dups(self): + def time_loc_dups(self, index, index_structure): self.df_dup.loc[self.idx_dupe] - def time_iloc(self): + def time_iloc(self, index, index_structure): self.df.iloc[:100, 0] - def time_loc(self): + def time_loc(self, index, index_structure): self.df.loc[:100, 0] - def time_bool_indexer(self): + def time_bool_indexer(self, index, index_structure): self.df[self.bool_indexer] class Take: - params = ["int", "datetime"] param_names = ["index"] def setup(self, index): N = 100000 indexes = { - "int": Int64Index(np.arange(N)), + "int": Index(np.arange(N), dtype=np.int64), "datetime": date_range("2011-01-01", freq="S", periods=N), } index = indexes[index] self.s = Series(np.random.rand(N), index=index) - self.indexer = [True, False, True, True, False] * 20000 + self.indexer = np.random.randint(0, N, size=N) def time_take(self, index): self.s.take(self.indexer) class MultiIndexing: - def setup(self): - mi = MultiIndex.from_product([range(1000), range(1000)]) - self.s = Series(np.random.randn(1000000), index=mi) - self.df = DataFrame(self.s) + params = [True, False] + param_names = ["unique_levels"] - n = 100000 - with warnings.catch_warnings(record=True): - self.mdt = DataFrame( - { - "A": np.random.choice(range(10000, 45000, 1000), n), - "B": np.random.choice(range(10, 400), n), - "C": np.random.choice(range(1, 150), n), - "D": np.random.choice(range(10000, 45000), n), - "x": np.random.choice(range(400), n), - "y": np.random.choice(range(25), n), - } - ) - self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] - self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() + def setup(self, unique_levels): + self.nlevels = 2 + if unique_levels: + mi = MultiIndex.from_arrays([range(1000000)] * self.nlevels) + else: + mi = MultiIndex.from_product([range(1000)] * self.nlevels) + self.df = DataFrame(np.random.randn(len(mi)), index=mi) + + self.tgt_slice = slice(200, 800) + self.tgt_null_slice = slice(None) + self.tgt_list = list(range(0, 1000, 10)) + self.tgt_scalar = 500 + + bool_indexer = np.zeros(len(mi), dtype=np.bool_) + bool_indexer[slice(0, len(mi), 100)] = True + self.tgt_bool_indexer = bool_indexer + + def time_loc_partial_key_slice(self, unique_levels): + self.df.loc[self.tgt_slice, :] + + def time_loc_partial_key_null_slice(self, unique_levels): + self.df.loc[self.tgt_null_slice, :] - def time_index_slice(self): - self.mdt.loc[self.idx, :] + def time_loc_partial_key_list(self, unique_levels): + self.df.loc[self.tgt_list, :] + + def time_loc_partial_key_scalar(self, unique_levels): + self.df.loc[self.tgt_scalar, :] + + def time_loc_partial_key_bool_indexer(self, unique_levels): + self.df.loc[self.tgt_bool_indexer, :] + + def time_loc_all_slices(self, unique_levels): + target = tuple([self.tgt_slice] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_null_slices(self, unique_levels): + target = tuple([self.tgt_null_slice] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_lists(self, unique_levels): + target = tuple([self.tgt_list] * self.nlevels) + self.df.loc[target, :] + + def time_loc_all_scalars(self, unique_levels): + target = tuple([self.tgt_scalar] * self.nlevels) + self.df.loc[target, :] + + def 
time_loc_all_bool_indexers(self, unique_levels): + target = tuple([self.tgt_bool_indexer] * self.nlevels) + self.df.loc[target, :] + + def time_loc_slice_plus_null_slice(self, unique_levels): + target = (self.tgt_slice, self.tgt_null_slice) + self.df.loc[target, :] + + def time_loc_null_slice_plus_slice(self, unique_levels): + target = (self.tgt_null_slice, self.tgt_slice) + self.df.loc[target, :] + + def time_xs_level_0(self, unique_levels): + target = self.tgt_scalar + self.df.xs(target, level=0) + + def time_xs_level_1(self, unique_levels): + target = self.tgt_scalar + self.df.xs(target, level=1) + + def time_xs_full_key(self, unique_levels): + target = tuple([self.tgt_scalar] * self.nlevels) + self.df.xs(target) class IntervalIndexing: @@ -241,13 +337,44 @@ def time_loc_list(self, monotonic): monotonic.loc[80000:] -class CategoricalIndexIndexing: +class DatetimeIndexIndexing: + def setup(self): + dti = date_range("2016-01-01", periods=10000, tz="US/Pacific") + dti2 = dti.tz_convert("UTC") + self.dti = dti + self.dti2 = dti2 + + def time_get_indexer_mismatched_tz(self): + # reached via e.g. + # ser = Series(range(len(dti)), index=dti) + # ser[dti2] + self.dti.get_indexer(self.dti2) + + +class SortedAndUnsortedDatetimeIndexLoc: + def setup(self): + dti = date_range("2016-01-01", periods=10000, tz="US/Pacific") + index = np.array(dti) + unsorted_index = index.copy() + unsorted_index[10] = unsorted_index[20] + + self.df_unsorted = DataFrame(index=unsorted_index, data={"a": 1}) + self.df_sort = DataFrame(index=index, data={"a": 1}) + + def time_loc_unsorted(self): + self.df_unsorted.loc["2016-6-11"] + + def time_loc_sorted(self): + self.df_sort.loc["2016-6-11"] + + +class CategoricalIndexIndexing: params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] param_names = ["index"] def setup(self, index): - N = 10 ** 5 + N = 10**5 values = list("a" * N + "b" * N + "c" * N) indices = { "monotonic_incr": CategoricalIndex(values), @@ -255,12 +382,13 @@ def setup(self, index): "non_monotonic": CategoricalIndex(list("abc" * N)), } self.data = indices[index] + self.data_unique = CategoricalIndex([str(i) for i in range(N * 3)]) self.int_scalar = 10000 self.int_list = list(range(10000)) self.cat_scalar = "b" - self.cat_list = ["a", "c"] + self.cat_list = ["1", "3"] def time_getitem_scalar(self, index): self.data[self.int_scalar] @@ -281,7 +409,7 @@ def time_get_loc_scalar(self, index): self.data.get_loc(self.cat_scalar) def time_get_indexer_list(self, index): - self.data.get_indexer(self.cat_list) + self.data_unique.get_indexer(self.cat_list) class MethodLookup: @@ -313,7 +441,7 @@ class IndexSingleRow: param_names = ["unique_cols"] def setup(self, unique_cols): - arr = np.arange(10 ** 7).reshape(-1, 10) + arr = np.arange(10**7).reshape(-1, 10) df = DataFrame(arr) dtypes = ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8", "f8", "f4"] for i, d in enumerate(dtypes): @@ -345,33 +473,61 @@ def time_frame_assign_timeseries_index(self): class InsertColumns: def setup(self): - self.N = 10 ** 3 + self.N = 10**3 self.df = DataFrame(index=range(self.N)) + self.df2 = DataFrame(np.random.randn(self.N, 2)) def time_insert(self): - np.random.seed(1234) for i in range(100): self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) + def time_insert_middle(self): + # same as time_insert but inserting to a middle column rather than + # front or back (which have fast-paths) + for i in range(100): + self.df2.insert( + 1, "colname", np.random.randn(self.N), allow_duplicates=True + ) + def 
time_assign_with_setitem(self): - np.random.seed(1234) for i in range(100): self.df[i] = np.random.randn(self.N) + def time_assign_list_like_with_setitem(self): + self.df[list(range(100))] = np.random.randn(self.N, 100) -class ChainIndexing: + def time_assign_list_of_columns_concat(self): + df = DataFrame(np.random.randn(self.N, 100)) + concat([self.df, df], axis=1) + + +class Setitem: + def setup(self): + N = 500_000 + cols = 500 + self.df = DataFrame(np.random.rand(N, cols)) + def time_setitem(self): + self.df[100] = 100 + + def time_setitem_list(self): + self.df[[100, 200, 300]] = 100 + + +class ChainIndexing: params = [None, "warn"] param_names = ["mode"] def setup(self, mode): self.N = 1000000 + self.df = DataFrame({"A": np.arange(self.N), "B": "foo"}) def time_chained_indexing(self, mode): + df = self.df + N = self.N with warnings.catch_warnings(record=True): with option_context("mode.chained_assignment", mode): - df = DataFrame({"A": np.arange(self.N), "B": "foo"}) - df2 = df[df.A > self.N // 2] + df2 = df[df.A > N // 2] df2["C"] = 1.0 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 44a22dfa77791..6585a4be78dc6 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,7 +1,19 @@ +""" +Benchmarks in this file depend mostly on code in _libs/ + +We have to created masked arrays to test the masked engine though. The +array is unpacked on the Cython level. + +If a PR does not edit anything in _libs, it is very unlikely that benchmarks +in this file will be affected. +""" + import numpy as np from pandas._libs import index as libindex +from pandas.core.arrays import BaseMaskedArray + def _get_numeric_engines(): engine_names = [ @@ -23,39 +35,135 @@ def _get_numeric_engines(): ] -class NumericEngineIndexing: +def _get_masked_engines(): + engine_names = [ + ("MaskedInt64Engine", "Int64"), + ("MaskedInt32Engine", "Int32"), + ("MaskedInt16Engine", "Int16"), + ("MaskedInt8Engine", "Int8"), + ("MaskedUInt64Engine", "UInt64"), + ("MaskedUInt32Engine", "UInt32"), + ("MaskedUInt16engine", "UInt16"), + ("MaskedUInt8Engine", "UInt8"), + ("MaskedFloat64Engine", "Float64"), + ("MaskedFloat32Engine", "Float32"), + ] + return [ + (getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name) + ] + +class NumericEngineIndexing: params = [ _get_numeric_engines(), ["monotonic_incr", "monotonic_decr", "non_monotonic"], + [True, False], + [10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF ] - param_names = ["engine_and_dtype", "index_type"] + param_names = ["engine_and_dtype", "index_type", "unique", "N"] - def setup(self, engine_and_dtype, index_type): + def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype - N = 10 ** 5 - values = list([1] * N + [2] * N + [3] * N) - arr = { - "monotonic_incr": np.array(values, dtype=dtype), - "monotonic_decr": np.array(list(reversed(values)), dtype=dtype), - "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype), - }[index_type] - self.data = engine(lambda: arr, len(arr)) + if index_type == "monotonic_incr": + if unique: + arr = np.arange(N * 3, dtype=dtype) + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype) + elif index_type == "monotonic_decr": + if unique: + arr = np.arange(N * 3, dtype=dtype)[::-1] + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype)[::-1] + else: + assert index_type == "non_monotonic" + 
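# Note (illustrative, not part of the patch): whichever branch is taken, the
# benchmark ends up doing the same two things -- construct the engine directly
# from the values array and then call ``get_loc``. The first lookup triggers
# whatever one-time work the engine needs (monotonicity checks and, for
# non-monotonic data, building the hash table), which is why ``setup`` performs
# a throwaway lookup before timing. Roughly:
#
#     from pandas._libs import index as libindex
#     import numpy as np
#     eng = libindex.Int64Engine(np.arange(10, dtype=np.int64))
#     eng.get_loc(7)   # -> 7; later lookups reuse the cached structures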
if unique: + arr = np.empty(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) + else: + arr = np.array([1, 2, 3] * N, dtype=dtype) + + self.data = engine(arr) # code belows avoids populating the mapping etc. while timing. self.data.get_loc(2) - def time_get_loc(self, engine_and_dtype, index_type): + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] + + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) + + +class MaskedNumericEngineIndexing: + params = [ + _get_masked_engines(), + ["monotonic_incr", "monotonic_decr", "non_monotonic"], + [True, False], + [10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF + ] + param_names = ["engine_and_dtype", "index_type", "unique", "N"] + + def setup(self, engine_and_dtype, index_type, unique, N): + engine, dtype = engine_and_dtype + + if index_type == "monotonic_incr": + if unique: + arr = np.arange(N * 3, dtype=dtype.lower()) + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype.lower()) + mask = np.zeros(N * 3, dtype=np.bool_) + elif index_type == "monotonic_decr": + if unique: + arr = np.arange(N * 3, dtype=dtype.lower())[::-1] + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype.lower())[::-1] + mask = np.zeros(N * 3, dtype=np.bool_) + else: + assert index_type == "non_monotonic" + if unique: + arr = np.zeros(N * 3, dtype=dtype.lower()) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower()) + arr[N:] = np.arange(N * 2, dtype=dtype.lower()) + + else: + arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) + mask = np.zeros(N * 3, dtype=np.bool_) + mask[-1] = True + + self.data = engine(BaseMaskedArray(arr, mask)) + # code belows avoids populating the mapping etc. while timing. self.data.get_loc(2) + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] -class ObjectEngineIndexing: + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) + +class ObjectEngineIndexing: params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] param_names = ["index_type"] def setup(self, index_type): - N = 10 ** 5 + N = 10**5 values = list("a" * N + "b" * N + "c" * N) arr = { "monotonic_incr": np.array(values, dtype=object), @@ -63,7 +171,7 @@ def setup(self, index_type): "non_monotonic": np.array(list("abc") * N, dtype=object), }[index_type] - self.data = libindex.ObjectEngine(lambda: arr, len(arr)) + self.data = libindex.ObjectEngine(arr) # code belows avoids populating the mapping etc. while timing. self.data.get_loc("b") diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 40b064229ae49..476ff14dcc92a 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,12 +1,29 @@ +""" +The functions benchmarked in this file depend _almost_ exclusively on +_libs, but not in a way that is easy to formalize. 
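As a rough illustration, the call patterns exercised below look like the
following (the literal values here are only examples):

    to_numeric(Series(["1.0", "2", "-3"]), errors="coerce", downcast="integer")
    to_datetime(["2018-03-01 12:00:00"] * 3, format="%Y-%m-%d %H:%M:%S")
    to_timedelta(["1 days", "00:00:30"])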
+ +If a PR does not change anything in pandas/_libs/ or pandas/core/tools/, then +it is likely that these benchmarks will be unaffected. +""" + import numpy as np -from pandas import Series, to_numeric +from pandas import ( + NaT, + Series, + date_range, + to_datetime, + to_numeric, + to_timedelta, +) -from .pandas_vb_common import lib, tm +from .pandas_vb_common import ( + lib, + tm, +) class ToNumeric: - params = ["ignore", "coerce"] param_names = ["errors"] @@ -27,7 +44,6 @@ def time_from_str(self, errors): class ToNumericDowncast: - param_names = ["dtype", "downcast"] params = [ [ @@ -42,7 +58,7 @@ class ToNumericDowncast: ] N = 500000 - N2 = int(N / 2) + N2 = N // 2 data_dict = { "string-int": ["1"] * N2 + [2] * N2, @@ -63,9 +79,12 @@ def time_downcast(self, dtype, downcast): class MaybeConvertNumeric: + # maybe_convert_numeric depends _exclusively_ on _libs, could + # go in benchmarks/libs.py + def setup_cache(self): - N = 10 ** 6 - arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64") + N = 10**6 + arr = np.repeat([2**63], N) + np.arange(N).astype("uint64") data = arr.astype(object) data[1::2] = arr[1::2].astype(str) data[-1] = -1 @@ -75,4 +94,225 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) +class MaybeConvertObjects: + # maybe_convert_objects depends _almost_ exclusively on _libs, but + # does have some run-time imports from outside of _libs + + def setup(self): + N = 10**5 + + data = list(range(N)) + data[0] = NaT + data = np.array(data) + self.data = data + + def time_maybe_convert_objects(self): + lib.maybe_convert_objects(self.data) + + +class ToDatetimeFromIntsFloats: + def setup(self): + self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") + self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64") + self.ts_sec_float = self.ts_sec.astype("float64") + + self.ts_nanosec = 1_000_000 * self.ts_sec + self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint + self.ts_nanosec_float = self.ts_nanosec.astype("float64") + + # speed of int64, uint64 and float64 paths should be comparable + + def time_nanosec_int64(self): + to_datetime(self.ts_nanosec, unit="ns") + + def time_nanosec_uint64(self): + to_datetime(self.ts_nanosec_uint, unit="ns") + + def time_nanosec_float64(self): + to_datetime(self.ts_nanosec_float, unit="ns") + + def time_sec_uint64(self): + to_datetime(self.ts_sec_uint, unit="s") + + def time_sec_int64(self): + to_datetime(self.ts_sec, unit="s") + + def time_sec_float64(self): + to_datetime(self.ts_sec_float, unit="s") + + +class ToDatetimeYYYYMMDD: + def setup(self): + rng = date_range(start="1/1/2000", periods=10000, freq="D") + self.stringsD = Series(rng.strftime("%Y%m%d")) + + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format="%Y%m%d") + + +class ToDatetimeCacheSmallCount: + params = ([True, False], [50, 500, 5000, 100000]) + param_names = ["cache", "count"] + + def setup(self, cache, count): + rng = date_range(start="1/1/1971", periods=count) + self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() + + def time_unique_date_strings(self, cache, count): + to_datetime(self.unique_date_strings, cache=cache) + + +class ToDatetimeISO8601: + def setup(self): + rng = date_range(start="1/1/2000", periods=20000, freq="H") + self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() + self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() + self.strings_tz_space = [ + x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng + ] + self.strings_zero_tz = 
[x.strftime("%Y-%m-%d %H:%M:%S") + "Z" for x in rng] + + def time_iso8601(self): + to_datetime(self.strings) + + def time_iso8601_nosep(self): + to_datetime(self.strings_nosep) + + def time_iso8601_format(self): + to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S") + + def time_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") + + def time_iso8601_tz_spaceformat(self): + to_datetime(self.strings_tz_space) + + def time_iso8601_infer_zero_tz_fromat(self): + # GH 41047 + to_datetime(self.strings_zero_tz, infer_datetime_format=True) + + +class ToDatetimeNONISO8601: + def setup(self): + N = 10000 + half = N // 2 + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" + self.same_offset = [ts_string_1] * N + self.diff_offset = [ts_string_1] * half + [ts_string_2] * half + + def time_same_offset(self): + to_datetime(self.same_offset) + + def time_different_offset(self): + to_datetime(self.diff_offset) + + +class ToDatetimeFormatQuarters: + def setup(self): + self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) + + def time_infer_quarter(self): + to_datetime(self.s) + + +class ToDatetimeFormat: + def setup(self): + N = 100000 + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) + self.s2 = self.s.str.replace(":\\S+$", "", regex=True) + + self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N + self.diff_offset = [ + f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) + ] * (N // 10) + + def time_exact(self): + to_datetime(self.s2, format="%d%b%y") + + def time_no_exact(self): + to_datetime(self.s, format="%d%b%y", exact=False) + + def time_same_offset(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_different_offset(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_same_offset_to_utc(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + def time_different_offset_to_utc(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + +class ToDatetimeCache: + params = [True, False] + param_names = ["cache"] + + def setup(self, cache): + N = 10000 + self.unique_numeric_seconds = list(range(N)) + self.dup_numeric_seconds = [1000] * N + self.dup_string_dates = ["2000-02-11"] * N + self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N + + def time_unique_seconds_and_unit(self, cache): + to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) + + def time_dup_seconds_and_unit(self, cache): + to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) + + def time_dup_string_dates(self, cache): + to_datetime(self.dup_string_dates, cache=cache) + + def time_dup_string_dates_and_format(self, cache): + to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) + + def time_dup_string_tzoffset_dates(self, cache): + to_datetime(self.dup_string_with_tz, cache=cache) + + +# GH 43901 +class ToDatetimeInferDatetimeFormat: + def setup(self): + rng = date_range(start="1/1/2000", periods=100000, freq="H") + self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() + + def time_infer_datetime_format(self): + to_datetime(self.strings, infer_datetime_format=True) + + +class ToTimedelta: + def setup(self): + self.ints = np.random.randint(0, 60, size=10000) + self.str_days = [] + self.str_seconds = [] + for i in self.ints: + self.str_days.append(f"{i} days") + self.str_seconds.append(f"00:00:{i:02d}") + + def time_convert_int(self): + to_timedelta(self.ints, unit="s") + + def 
time_convert_string_days(self): + to_timedelta(self.str_days) + + def time_convert_string_seconds(self): + to_timedelta(self.str_seconds) + + +class ToTimedeltaErrors: + params = ["coerce", "ignore"] + param_names = ["errors"] + + def setup(self, errors): + ints = np.random.randint(0, 60, size=10000) + self.arr = [f"{i} days" for i in ints] + self.arr[-1] = "apple" + + def time_convert(self, errors): + to_timedelta(self.arr, errors=errors) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..cde42d99d49a0 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,16 +1,28 @@ -from io import StringIO +from io import ( + BytesIO, + StringIO, +) import random import string import numpy as np -from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime +from pandas import ( + Categorical, + DataFrame, + concat, + date_range, + read_csv, + to_datetime, +) -from ..pandas_vb_common import BaseIO, tm +from ..pandas_vb_common import ( + BaseIO, + tm, +) class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -42,8 +54,26 @@ def time_frame(self, kind): self.df.to_csv(self.fname) -class ToCSVDatetime(BaseIO): +class ToCSVMultiIndexUnusedLevels(BaseIO): + fname = "__test__.csv" + + def setup(self): + df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1}) + self.df = df.set_index(["a", "b"]) + self.df_unused_levels = self.df.iloc[:10_000] + self.df_single_index = df.set_index(["a"]).iloc[:10_000] + + def time_full_frame(self): + self.df.to_csv(self.fname) + + def time_sliced_frame(self): + self.df_unused_levels.to_csv(self.fname) + + def time_single_index_frame(self): + self.df_single_index.to_csv(self.fname) + +class ToCSVDatetime(BaseIO): fname = "__test__.csv" def setup(self): @@ -54,8 +84,21 @@ def time_frame_date_formatting(self): self.data.to_csv(self.fname, date_format="%Y%m%d") -class ToCSVDatetimeBig(BaseIO): +class ToCSVDatetimeIndex(BaseIO): + fname = "__test__.csv" + + def setup(self): + rng = date_range("2000", periods=100_000, freq="S") + self.data = DataFrame({"a": 1}, index=rng) + + def time_frame_date_formatting_index(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_date_no_format_index(self): + self.data.to_csv(self.fname) + +class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -76,6 +119,53 @@ def time_frame(self, obs): self.data.to_csv(self.fname) +class ToCSVIndexes(BaseIO): + fname = "__test__.csv" + + @staticmethod + def _create_df(rows, cols): + index_cols = { + "index1": np.random.randint(0, rows, rows), + "index2": np.full(rows, 1, dtype=int), + "index3": np.full(rows, 1, dtype=int), + } + data_cols = { + f"col{i}": np.random.uniform(0, 100000.0, rows) for i in range(cols) + } + df = DataFrame({**index_cols, **data_cols}) + return df + + def setup(self): + ROWS = 100000 + COLS = 5 + # For tests using .head(), create an initial dataframe with this many times + # more rows + HEAD_ROW_MULTIPLIER = 10 + + self.df_standard_index = self._create_df(ROWS, COLS) + + self.df_custom_index_then_head = ( + self._create_df(ROWS * HEAD_ROW_MULTIPLIER, COLS) + .set_index(["index1", "index2", "index3"]) + .head(ROWS) + ) + + self.df_head_then_custom_index = ( + self._create_df(ROWS * HEAD_ROW_MULTIPLIER, COLS) + .head(ROWS) + .set_index(["index1", "index2", "index3"]) + ) + + def 
time_standard_index(self): + self.df_standard_index.to_csv(self.fname) + + def time_multiindex(self): + self.df_head_then_custom_index.to_csv(self.fname) + + def time_head_of_multiindex(self): + self.df_custom_index_then_head.to_csv(self.fname) + + class StringIORewind: def data(self, stringio_object): stringio_object.seek(0) @@ -83,7 +173,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -108,7 +197,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -126,7 +214,6 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -144,12 +231,11 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c", "python", "pyarrow"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,14 +250,14 @@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): def setup(self): - self.na_values = [2 ** 63 + 500] - arr = np.arange(10000).astype("uint64") + 2 ** 63 + self.na_values = [2**63 + 500] + arr = np.arange(10000).astype("uint64") + 2**63 self.data1 = StringIO("\n".join(arr.astype(str).tolist())) arr = arr.astype(object) arr[500] = -1 @@ -190,12 +276,11 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" - params = ([",", "|"], [None, ","]) - param_names = ["sep", "thousands"] + params = ([",", "|"], [None, ","], ["c", "python"]) + param_names = ["sep", "thousands", "engine"] - def setup(self, sep, thousands): + def setup(self, sep, thousands, engine): N = 10000 K = 8 data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) @@ -206,29 +291,32 @@ def setup(self, sep, thousands): df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) - def time_thousands(self, sep, thousands): - read_csv(self.fname, sep=sep, thousands=thousands) + def time_thousands(self, sep, thousands, engine): + read_csv(self.fname, sep=sep, thousands=thousands, engine=engine) class ReadCSVComment(StringIORewind): - def setup(self): + params = ["c", "python"] + param_names = ["engine"] + + def setup(self, engine): data = ["A,B,C"] + (["1,2,3 # comment"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_comment(self): + def time_comment(self, engine): read_csv( self.data(self.StringIO_input), comment="#", header=None, names=list("abc") ) class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] def setup(self, sep, decimal, float_precision): floats = [ - "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) + "".join([random.choice(string.digits) for _ in range(28)]) + for _ in range(15) ] rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 @@ -255,25 +343,46 @@ def time_read_csv_python_engine(self, sep, 
decimal, float_precision): ) -class ReadCSVCategorical(BaseIO): +class ReadCSVEngine(StringIORewind): + params = ["c", "python", "pyarrow"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + # simulate reading from file + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) + + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + + +class ReadCSVCategorical(BaseIO): fname = "__test__.csv" + params = ["c", "python"] + param_names = ["engine"] - def setup(self): + def setup(self, engine): N = 100000 group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"] df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc")) df.to_csv(self.fname, index=False) - def time_convert_post(self): - read_csv(self.fname).apply(Categorical) + def time_convert_post(self, engine): + read_csv(self.fname, engine=engine).apply(Categorical) - def time_convert_direct(self): - read_csv(self.fname, dtype="category") + def time_convert_direct(self, engine): + read_csv(self.fname, engine=engine, dtype="category") class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -284,18 +393,20 @@ def setup(self): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -304,17 +415,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): - data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 + def setup(self, do_cache, engine): + data = ("\n".join([f"10/{year}" for year in range(2000, 2100)]) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -325,41 +437,43 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" + params = ["c", "python"] + param_names = ["engine"] - def setup(self): + def setup(self, engine): with open(self.fname, "w") as f: for i in range(self.num_rows): f.write(f"{i}\n") - def mem_parser_chunks(self): + def mem_parser_chunks(self, engine): # see gh-24805. 
- result = read_csv(self.fname, chunksize=self.chunksize) + result = read_csv(self.fname, chunksize=self.chunksize, engine=engine) for _ in result: pass class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], @@ -367,6 +481,33 @@ def time_read_special_date(self, value): ) +class ReadCSVMemMapUTF8: + fname = "__test__.csv" + number = 5 + + def setup(self): + lines = [] + line_length = 128 + start_char = " " + end_char = "\U00010080" + # This for loop creates a list of 128-char strings + # consisting of consecutive Unicode chars + for lnum in range(ord(start_char), ord(end_char), line_length): + line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" + try: + line.encode("utf-8") + except UnicodeEncodeError: + # Some 16-bit words are not valid Unicode chars and must be skipped + continue + lines.append(line) + df = DataFrame(lines) + df = concat([df for n in range(100)], ignore_index=True) + df.to_csv(self.fname, index=False, header=False, encoding="utf-8") + + def time_read_memmapped_utf8(self): + read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c") + + class ParseDateComparison(StringIORewind): params = ([False, True],) param_names = ["cache_dates"] @@ -404,4 +545,29 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") +class ReadCSVIndexCol(StringIORewind): + def setup(self): + count_elem = 100_000 + data = "a,b\n" + "1,2\n" * count_elem + self.StringIO_input = StringIO(data) + + def time_read_csv_index_col(self): + read_csv(self.StringIO_input, index_col="a") + + +class ReadCSVDatePyarrowEngine(StringIORewind): + def setup(self): + count_elem = 100_000 + data = "a\n" + "2019-12-31\n" * count_elem + self.StringIO_input = StringIO(data) + + def time_read_csv_index_col(self): + read_csv( + self.StringIO_input, + parse_dates=["a"], + engine="pyarrow", + dtype_backend="pyarrow", + ) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..093a35a20dc5a 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -2,10 +2,19 @@ import numpy as np from odf.opendocument import OpenDocumentSpreadsheet -from odf.table import Table, TableCell, TableRow +from odf.table import ( + Table, + TableCell, + TableRow, +) from odf.text import P -from pandas import DataFrame, ExcelWriter, date_range, read_excel +from pandas import ( + DataFrame, + ExcelWriter, + date_range, + read_excel, +) from ..pandas_vb_common import tm @@ -23,8 +32,7 @@ def _generate_dataframe(): class WriteExcel: - - params = ["openpyxl", "xlsxwriter", "xlwt"] + params = ["openpyxl", "xlsxwriter"] param_names = ["engine"] def setup(self, engine): @@ -33,14 +41,30 @@ def setup(self, engine): def time_write_excel(self, engine): bio = BytesIO() bio.seek(0) - writer = ExcelWriter(bio, engine=engine) - self.df.to_excel(writer, 
sheet_name="Sheet1") - writer.save() + with ExcelWriter(bio, engine=engine) as writer: + self.df.to_excel(writer, sheet_name="Sheet1") -class ReadExcel: +class WriteExcelStyled: + params = ["openpyxl", "xlsxwriter"] + param_names = ["engine"] + + def setup(self, engine): + self.df = _generate_dataframe() + + def time_write_excel_style(self, engine): + bio = BytesIO() + bio.seek(0) + with ExcelWriter(bio, engine=engine) as writer: + df_style = self.df.style + df_style.applymap(lambda x: "border: red 1px solid;") + df_style.applymap(lambda x: "color: blue") + df_style.applymap(lambda x: "border-color: green black", subset=["float1"]) + df_style.to_excel(writer, sheet_name="Sheet1") - params = ["xlrd", "openpyxl", "odf"] + +class ReadExcel: + params = ["openpyxl", "odf"] param_names = ["engine"] fname_excel = "spreadsheet.xlsx" fname_odf = "spreadsheet.ods" @@ -66,8 +90,20 @@ def setup_cache(self): self._create_odf() def time_read_excel(self, engine): - fname = self.fname_odf if engine == "odf" else self.fname_excel + if engine == "odf": + fname = self.fname_odf + else: + fname = self.fname_excel read_excel(fname, engine=engine) +class ReadExcelNRows(ReadExcel): + def time_read_excel(self, engine): + if engine == "odf": + fname = self.fname_odf + else: + fname = self.fname_excel + read_excel(fname, engine=engine, nrows=10) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 4ca399a293a4b..f3e417e717609 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,8 +1,16 @@ import numpy as np -from pandas import DataFrame, HDFStore, date_range, read_hdf +from pandas import ( + DataFrame, + HDFStore, + date_range, + read_hdf, +) -from ..pandas_vb_common import BaseIO, tm +from ..pandas_vb_common import ( + BaseIO, + tm, +) class HDFStoreDataFrame(BaseIO): @@ -35,7 +43,7 @@ def setup(self): np.random.randn(N, 100), index=date_range("1/1/2000", periods=N) ) self.df_dc = DataFrame( - np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)] + np.random.randn(N, 10), columns=[f"C{i:03d}" for i in range(10)] ) self.fname = "__test__.h5" @@ -104,7 +112,6 @@ def time_store_info(self): class HDF(BaseIO): - params = ["table", "fixed"] param_names = ["format"] @@ -120,9 +127,17 @@ def setup(self, format): self.df["object"] = tm.makeStringIndex(N) self.df.to_hdf(self.fname, "df", format=format) + # Numeric df + self.df1 = self.df.copy() + self.df1 = self.df1.reset_index() + self.df1.to_hdf(self.fname, "df1", format=format) + def time_read_hdf(self, format): read_hdf(self.fname, "df") + def peakmem_read_hdf(self, format): + read_hdf(self.fname, "df") + def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index ed0fb5b8fe342..9eaffddd8b87f 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -2,13 +2,22 @@ import numpy as np -from pandas import DataFrame, concat, date_range, read_json, timedelta_range +from pandas import ( + DataFrame, + concat, + date_range, + json_normalize, + read_json, + timedelta_range, +) -from ..pandas_vb_common import BaseIO, tm +from ..pandas_vb_common import ( + BaseIO, + tm, +) class ReadJSON(BaseIO): - fname = "__test__.json" params = (["split", "index", "records"], ["int", "datetime"]) param_names = ["orient", "index"] @@ -31,7 +40,6 @@ def time_read_json(self, orient, index): class ReadJSONLines(BaseIO): - fname 
= "__test_lines__.json" params = ["int", "datetime"] param_names = ["index"] @@ -68,8 +76,28 @@ def peakmem_read_json_lines_nrows(self, index): read_json(self.fname, orient="records", lines=True, nrows=15000) -class ToJSON(BaseIO): +class NormalizeJSON(BaseIO): + fname = "__test__.json" + params = [ + ["split", "columns", "index", "values", "records"], + ["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"], + ] + param_names = ["orient", "frame"] + + def setup(self, orient, frame): + data = { + "hello": ["thisisatest", 999898, "mixed types"], + "nest1": {"nest2": {"nest3": "nest3_value", "nest3_int": 3445}}, + "nest1_list": {"nest2": ["blah", 32423, 546456.876, 92030234]}, + "hello2": "string", + } + self.data = [data for i in range(10000)] + + def time_normalize_json(self, orient, frame): + json_normalize(self.data) + +class ToJSON(BaseIO): fname = "__test__.json" params = [ ["split", "columns", "index", "values", "records"], @@ -78,7 +106,7 @@ class ToJSON(BaseIO): param_names = ["orient", "frame"] def setup(self, orient, frame): - N = 10 ** 5 + N = 10**5 ncols = 5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") @@ -141,15 +169,19 @@ def time_to_json(self, orient, frame): def peakmem_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) - def time_to_json_wide(self, orient, frame): + +class ToJSONWide(ToJSON): + def setup(self, orient, frame): + super().setup(orient, frame) base_df = getattr(self, frame).copy() - df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + df_wide = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) + self.df_wide = df_wide + + def time_to_json_wide(self, orient, frame): + self.df_wide.to_json(self.fname, orient=orient) def peakmem_to_json_wide(self, orient, frame): - base_df = getattr(self, frame).copy() - df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) - df.to_json(self.fname, orient=orient) + self.df_wide.to_json(self.fname, orient=orient) class ToJSONISO(BaseIO): @@ -158,7 +190,7 @@ class ToJSONISO(BaseIO): param_names = ["orient"] def setup(self, orient): - N = 10 ** 5 + N = 10**5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") @@ -177,11 +209,10 @@ def time_iso_format(self, orient): class ToJSONLines(BaseIO): - fname = "__test__.json" def setup(self): - N = 10 ** 5 + N = 10**5 ncols = 5 index = date_range("20000101", periods=N, freq="H") timedeltas = timedelta_range(start=1, periods=N, freq="s") @@ -259,7 +290,8 @@ def time_float_longint_str_lines(self): class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - frames = {"int": df, "float": df.astype(float)} + df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="T")) + frames = {"int": df, "float": df.astype(float), "datetime": df2} return frames @@ -273,5 +305,10 @@ def peakmem_float(self, frames): for _ in range(100_000): df.to_json() + def peakmem_time(self, frames): + df = frames["datetime"] + for _ in range(10_000): + df.to_json(orient="table") + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index ec3eddfff7184..1078837a8e395 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,8 +2,8 @@ try: from pandas._libs.tslibs.parsing 
import ( - concat_date_cols, _does_string_look_like_datetime, + concat_date_cols, ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) @@ -11,7 +11,6 @@ class DoesStringLookLikeDatetime: - params = (["2Q2005", "0.0", "10000"],) param_names = ["value"] @@ -24,7 +23,6 @@ def time_check_datetimes(self, value): class ConcatDateCols: - params = ([1234567890, "AAAA"], [1, 2]) param_names = ["value", "dim"] diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 4ca9a82ae4827..c71cdcdcc5c59 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,8 +1,15 @@ import numpy as np -from pandas import DataFrame, date_range, read_pickle +from pandas import ( + DataFrame, + date_range, + read_pickle, +) -from ..pandas_vb_common import BaseIO, tm +from ..pandas_vb_common import ( + BaseIO, + tm, +) class Pickle(BaseIO): @@ -24,5 +31,11 @@ def time_read_pickle(self): def time_write_pickle(self): self.df.to_pickle(self.fname) + def peakmem_read_pickle(self): + read_pickle(self.fname) + + def peakmem_write_pickle(self): + self.df.to_pickle(self.fname) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 369b79641dbc4..411e5b6099f76 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -1,30 +1,23 @@ -import os +from pathlib import Path from pandas import read_sas +ROOT = Path(__file__).parents[3] / "pandas" / "tests" / "io" / "sas" / "data" + class SAS: + def time_read_sas7bdat(self): + read_sas(ROOT / "test1.sas7bdat") - params = ["sas7bdat", "xport"] - param_names = ["format"] + def time_read_xpt(self): + read_sas(ROOT / "paxraw_d_short.xpt") - def setup(self, format): - # Read files that are located in 'pandas/tests/io/sas/data' - files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"} - file = files[format] - paths = [ - os.path.dirname(__file__), - "..", - "..", - "..", - "pandas", - "tests", - "io", - "sas", - "data", - file, - ] - self.f = os.path.join(*paths) + def time_read_sas7bdat_2(self): + next(read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=11000)) - def time_read_sas(self, format): - read_sas(self.f, format=format) + def time_read_sas7bdat_2_chunked(self): + for i, _ in enumerate( + read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=1000) + ): + if i == 10: + break diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index b71bb832280b9..6f893ee72d918 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -3,13 +3,17 @@ import numpy as np from sqlalchemy import create_engine -from pandas import DataFrame, date_range, read_sql_query, read_sql_table +from pandas import ( + DataFrame, + date_range, + read_sql_query, + read_sql_table, +) from ..pandas_vb_common import tm class SQL: - params = ["sqlalchemy", "sqlite"] param_names = ["connection"] @@ -33,7 +37,9 @@ def setup(self, connection): }, index=tm.makeStringIndex(N), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -45,10 +51,18 @@ def time_read_sql_query(self, connection): class WriteSQLDtypes: - params = ( ["sqlalchemy", "sqlite"], - ["float", "float_with_nan", "string", 
"bool", "int", "datetime"], + [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ], ) param_names = ["connection", "dtype"] @@ -72,7 +86,9 @@ def setup(self, connection, dtype): }, index=tm.makeStringIndex(N), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -99,7 +115,9 @@ def setup(self): }, index=tm.makeStringIndex(N), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -116,8 +134,16 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - - params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + params = [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ] param_names = ["dtype"] def setup(self, dtype): @@ -135,7 +161,9 @@ def setup(self, dtype): }, index=tm.makeStringIndex(N), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 9faafa82ff46e..300b9c778f1f8 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,12 +1,18 @@ import numpy as np -from pandas import DataFrame, date_range, read_stata +from pandas import ( + DataFrame, + date_range, + read_stata, +) -from ..pandas_vb_common import BaseIO, tm +from ..pandas_vb_common import ( + BaseIO, + tm, +) class Stata(BaseIO): - params = ["tc", "td", "tm", "tw", "th", "tq", "ty"] param_names = ["convert_dates"] @@ -31,13 +37,13 @@ def setup(self, convert_dates): ) self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32) self.convert_dates = {"index": convert_dates} - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) def time_read_stata(self, convert_dates): read_stata(self.fname) def time_write_stata(self, convert_dates): - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) class StataMissing(Stata): @@ -47,7 +53,7 @@ def setup(self, convert_dates): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan self.df[f"missing_{i}"] = missing_data - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 4fc07bbabda06..e7d274b8bd631 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -1,10 +1,12 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + IndexSlice, +) -class RenderApply: - +class Render: params = [[12, 24, 36], [12, 120]] param_names = ["cols", "rows"] @@ -14,16 +16,46 @@ 
def setup(self, cols, rows): columns=[f"float_{i+1}" for i in range(cols)], index=[f"row_{i+1}" for i in range(rows)], ) - self._style_apply() - def time_render(self, cols, rows): - self.st.render() + def time_apply_render(self, cols, rows): + self._style_apply() + self.st._render_html(True, True) - def peakmem_apply(self, cols, rows): + def peakmem_apply_render(self, cols, rows): self._style_apply() + self.st._render_html(True, True) + + def time_classes_render(self, cols, rows): + self._style_classes() + self.st._render_html(True, True) + + def peakmem_classes_render(self, cols, rows): + self._style_classes() + self.st._render_html(True, True) + + def time_tooltips_render(self, cols, rows): + self._style_tooltips() + self.st._render_html(True, True) + + def peakmem_tooltips_render(self, cols, rows): + self._style_tooltips() + self.st._render_html(True, True) + + def time_format_render(self, cols, rows): + self._style_format() + self.st._render_html(True, True) - def peakmem_render(self, cols, rows): - self.st.render() + def peakmem_format_render(self, cols, rows): + self._style_format() + self.st._render_html(True, True) + + def time_apply_format_hide_render(self, cols, rows): + self._style_apply_format_hide() + self.st._render_html(True, True) + + def peakmem_apply_format_hide_render(self, cols, rows): + self._style_apply_format_hide() + self.st._render_html(True, True) def _style_apply(self): def _apply_func(s): @@ -32,3 +64,29 @@ def _apply_func(s): ] self.st = self.df.style.apply(_apply_func, axis=1) + + def _style_classes(self): + classes = self.df.applymap(lambda v: ("cls-1" if v > 0 else "")) + classes.index, classes.columns = self.df.index, self.df.columns + self.st = self.df.style.set_td_classes(classes) + + def _style_format(self): + ic = int(len(self.df.columns) / 4 * 3) + ir = int(len(self.df.index) / 4 * 3) + # apply a formatting function + # subset is flexible but hinders vectorised solutions + self.st = self.df.style.format( + "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"] + ) + + def _style_apply_format_hide(self): + self.st = self.df.style.applymap(lambda v: "color: red;") + self.st.format("{:.3f}") + self.st.hide(self.st.index[1:], axis=0) + self.st.hide(self.st.columns[1:], axis=1) + + def _style_tooltips(self): + ttips = DataFrame("abc", index=self.df.index[::2], columns=self.df.columns[::2]) + self.st = self.df.style.set_tooltips(ttips) + self.st.hide(self.st.index[12:], axis=0) + self.st.hide(self.st.columns[12:], axis=1) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1333b3a0f0560..eaa51730477cc 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -2,7 +2,17 @@ import numpy as np -from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + array, + concat, + date_range, + merge, + merge_asof, +) from .pandas_vb_common import tm @@ -12,28 +22,7 @@ from pandas import ordered_merge as merge_ordered -class Append: - def setup(self): - self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"]) - self.df2 = self.df1.copy() - self.df2.index = np.arange(10000, 20000) - self.mdf1 = self.df1.copy() - self.mdf1["obj1"] = "bar" - self.mdf1["obj2"] = "bar" - self.mdf1["int1"] = 5 - self.mdf1 = self.mdf1._consolidate() - self.mdf2 = self.mdf1.copy() - self.mdf2.index = self.df2.index - - def time_append_homogenous(self): - self.df1.append(self.df2) - - 
def time_append_mixed(self): - self.mdf1.append(self.mdf2) - - class Concat: - params = [0, 1] param_names = ["axis"] @@ -66,7 +55,6 @@ def time_concat_mixed_ndims(self, axis): class ConcatDataFrames: - params = ([0, 1], [True, False]) param_names = ["axis", "ignore_index"] @@ -83,8 +71,46 @@ def time_f_ordered(self, axis, ignore_index): concat(self.frame_f, axis=axis, ignore_index=ignore_index) -class Join: +class ConcatIndexDtype: + params = ( + ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"], + ["monotonic", "non_monotonic", "has_na"], + [0, 1], + [True, False], + ) + param_names = ["dtype", "structure", "axis", "sort"] + + def setup(self, dtype, structure, axis, sort): + N = 10_000 + if dtype == "datetime64[ns]": + vals = date_range("1970-01-01", periods=N) + elif dtype in ("int64", "Int64"): + vals = np.arange(N, dtype=np.int64) + elif dtype in ("string[python]", "string[pyarrow]"): + vals = tm.makeStringIndex(N) + else: + raise NotImplementedError + + idx = Index(vals, dtype=dtype) + + if structure == "monotonic": + idx = idx.sort_values() + elif structure == "non_monotonic": + idx = idx[::-1] + elif structure == "has_na": + if not idx._can_hold_na: + raise NotImplementedError + idx = Index([None], dtype=dtype).append(idx) + else: + raise NotImplementedError + + self.series = [Series(i, idx[:-i]) for i in range(1, 6)] + + def time_concat_series(self, dtype, structure, axis, sort): + concat(self.series, axis=axis, sort=sort) + +class Join: params = [True, False] param_names = ["sort"] @@ -132,6 +158,9 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) + def time_join_dataframes_cross(self, sort): + self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) + class JoinIndex: def setup(self): @@ -147,6 +176,31 @@ def time_left_outer_join_index(self): self.left.join(self.right, on="jim") +class JoinMultiindexSubset: + def setup(self): + N = 100_000 + mi1 = MultiIndex.from_arrays([np.arange(N)] * 4, names=["a", "b", "c", "d"]) + mi2 = MultiIndex.from_arrays([np.arange(N)] * 2, names=["a", "b"]) + self.left = DataFrame({"col1": 1}, index=mi1) + self.right = DataFrame({"col2": 2}, index=mi2) + + def time_join_multiindex_subset(self): + self.left.join(self.right) + + +class JoinEmpty: + def setup(self): + N = 100_000 + self.df = DataFrame({"A": np.arange(N)}) + self.df_empty = DataFrame(columns=["B", "C"], dtype="int64") + + def time_inner_join_left_empty(self): + self.df_empty.join(self.df, how="inner") + + def time_inner_join_right_empty(self): + self.df.join(self.df_empty, how="inner") + + class JoinNonUnique: # outer join of non-unique # GH 6329 @@ -155,7 +209,7 @@ def setup(self): daily_dates = date_index.to_period("D").to_timestamp("S", "S") self.fracofday = date_index.values - daily_dates.values self.fracofday = self.fracofday.astype("timedelta64[ns]") - self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0 + self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000 self.fracofday = Series(self.fracofday, daily_dates) index = date_range(date_index.min(), date_index.max(), freq="D") self.temp = Series(1.0, index)[self.fracofday.index] @@ -165,7 +219,6 @@ def time_join_non_unique_equal(self): class Merge: - params = [True, False] param_names = ["sort"] @@ -205,14 +258,53 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, 
on="key1", sort=sort) + def time_merge_dataframe_empty_right(self, sort): + merge(self.left, self.right.iloc[:0], sort=sort) -class I8Merge: + def time_merge_dataframe_empty_left(self, sort): + merge(self.left.iloc[:0], self.right, sort=sort) + + def time_merge_dataframes_cross(self, sort): + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + +class MergeEA: + params = [ + "Int64", + "Int32", + "Int16", + "UInt64", + "UInt32", + "UInt16", + "Float64", + "Float32", + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10_000 + indices = np.arange(1, N) + key = np.tile(indices[:8000], 10) + self.left = DataFrame( + {"key": Series(key, dtype=dtype), "value": np.random.randn(80000)} + ) + self.right = DataFrame( + { + "key": Series(indices[2000:], dtype=dtype), + "value2": np.random.randn(7999), + } + ) + + def time_merge(self, dtype): + merge(self.left, self.right) + + +class I8Merge: params = ["inner", "outer", "left", "right"] param_names = ["how"] def setup(self, how): - low, high, n = -1000, 1000, 10 ** 6 + low, high, n = -1000, 1000, 10**6 self.left = DataFrame( np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG") ) @@ -248,12 +340,24 @@ def setup(self): Z=self.right_object["Z"].astype("category") ) + self.left_cat_col = self.left_object.astype({"X": "category"}) + self.right_cat_col = self.right_object.astype({"X": "category"}) + + self.left_cat_idx = self.left_cat_col.set_index("X") + self.right_cat_idx = self.right_cat_col.set_index("X") + def time_merge_object(self): merge(self.left_object, self.right_object, on="X") def time_merge_cat(self): merge(self.left_cat, self.right_cat, on="X") + def time_merge_on_cat_col(self): + merge(self.left_cat_col, self.right_cat_col, on="X") + + def time_merge_on_cat_idx(self): + merge(self.left_cat_idx, self.right_cat_idx, on="X") + class MergeOrdered: def setup(self): @@ -366,10 +470,46 @@ def time_multiby(self, direction, tolerance): ) +class MergeMultiIndex: + params = [ + [ + ("int64", "int64"), + ("datetime64[ns]", "int64"), + ("Int64", "Int64"), + ], + ["left", "right", "inner", "outer"], + ] + param_names = ["dtypes", "how"] + + def setup(self, dtypes, how): + n = 100_000 + offset = 50_000 + mi1 = MultiIndex.from_arrays( + [ + array(np.arange(n), dtype=dtypes[0]), + array(np.arange(n), dtype=dtypes[1]), + ] + ) + mi2 = MultiIndex.from_arrays( + [ + array(np.arange(offset, n + offset), dtype=dtypes[0]), + array(np.arange(offset, n + offset), dtype=dtypes[1]), + ] + ) + self.df1 = DataFrame({"col1": 1}, index=mi1) + self.df2 = DataFrame({"col2": 2}, index=mi2) + + def time_merge_sorted_multiindex(self, dtypes, how): + # copy to avoid MultiIndex._values caching + df1 = self.df1.copy() + df2 = self.df2.copy() + merge(df1, df2, how=how, left_index=True, right_index=True) + + class Align: def setup(self): - size = 5 * 10 ** 5 - rng = np.arange(0, 10 ** 13, 10 ** 7) + size = 5 * 10**5 + rng = np.arange(0, 10**13, 10**7) stamps = np.datetime64("now").view("i8") + rng idx1 = np.sort(np.random.choice(stamps, size, replace=False)) idx2 = np.sort(np.random.choice(stamps, size, replace=False)) diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py new file mode 100644 index 0000000000000..f041499c9c622 --- /dev/null +++ b/asv_bench/benchmarks/libs.py @@ -0,0 +1,106 @@ +""" +Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs, +which has its own directory. + +If a PR does not edit anything in _libs/, then it is unlikely that the +benchmarks will be affected. 
+""" +import numpy as np + +from pandas._libs.lib import ( + infer_dtype, + is_list_like, + is_scalar, +) + +from pandas import ( + NA, + NaT, +) + +from .pandas_vb_common import ( + lib, + tm, +) + +try: + from pandas.util import cache_readonly +except ImportError: + from pandas.util.decorators import cache_readonly + + +# TODO: share with something in pd._testing? +scalars = [ + 0, + 1.0, + 1 + 2j, + True, + "foo", + b"bar", + None, + np.datetime64(123, "ns"), + np.timedelta64(123, "ns"), + NaT, + NA, +] +zero_dims = [np.array("123")] +listlikes = [np.array([1, 2, 3]), {0: 1}, {1, 2, 3}, [1, 2, 3], (1, 2, 3)] + + +class ScalarListLike: + params = scalars + zero_dims + listlikes + + def time_is_list_like(self, param): + is_list_like(param) + + def time_is_scalar(self, param): + is_scalar(param) + + +class FastZip: + def setup(self): + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + col_array = np.vstack([key1, key2, np.random.randn(N * K)]) + col_array2 = col_array.copy() + col_array2[:, :10000] = np.nan + self.col_array_list = list(col_array) + + def time_lib_fast_zip(self): + lib.fast_zip(self.col_array_list) + + +class InferDtype: + param_names = ["dtype"] + data_dict = { + "np-object": np.array([1] * 100000, dtype="O"), + "py-object": [1] * 100000, + "np-null": np.array([1] * 50000 + [np.nan] * 50000), + "py-null": [1] * 50000 + [None] * 50000, + "np-int": np.array([1] * 100000, dtype=int), + "np-floating": np.array([1.0] * 100000, dtype=float), + "empty": [], + "bytes": [b"a"] * 100000, + } + params = list(data_dict.keys()) + + def time_infer_dtype_skipna(self, dtype): + infer_dtype(self.data_dict[dtype], skipna=True) + + def time_infer_dtype(self, dtype): + infer_dtype(self.data_dict[dtype], skipna=False) + + +class CacheReadonly: + def setup(self): + class Foo: + @cache_readonly + def prop(self): + return 5 + + self.obj = Foo() + + def time_cache_readonly(self): + self.obj.prop diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 18dbb7eae0615..9c997b5386eaa 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -2,7 +2,15 @@ import numpy as np -from pandas import DataFrame, MultiIndex, RangeIndex, date_range +from pandas import ( + NA, + DataFrame, + MultiIndex, + RangeIndex, + Series, + array, + date_range, +) from .pandas_vb_common import tm @@ -42,6 +50,29 @@ def time_small_get_loc_warm(self): self.mi_small.get_loc((99, "A", "A")) +class GetLocs: + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=["one", "two", "three"], + ) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"] + ) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] + ) + + def time_large_get_locs(self): + self.mi_large.get_locs([999, 19, "Z"]) + + def time_med_get_locs(self): + self.mi_med.get_locs([999, 9, "A"]) + + def time_small_get_locs(self): + self.mi_small.get_locs([99, "A", "A"]) + + class Duplicates: def setup(self): size = 65536 @@ -107,7 +138,7 @@ def time_get_indexer_and_pad(self): self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad") def time_is_monotonic(self): - self.mi_int.is_monotonic + self.mi_int.is_monotonic_increasing class Duplicated: @@ -146,9 +177,21 @@ def time_sortlevel_one(self): 
self.mi.sortlevel(1) +class SortValues: + params = ["int64", "Int64"] + param_names = ["dtype"] + + def setup(self, dtype): + a = array(np.tile(np.arange(100), 1000), dtype=dtype) + b = array(np.tile(np.arange(1000), 100), dtype=dtype) + self.mi = MultiIndex.from_arrays([a, b]) + + def time_sort_values(self, dtype): + self.mi.sort_values() + + class Values: def setup_cache(self): - level1 = range(1000) level2 = date_range(start="1/1/2012", periods=100) mi = MultiIndex.from_product([level1, level2]) @@ -163,7 +206,6 @@ def time_datetime_level_values_sliced(self, mi): class CategoricalLevel: def setup(self): - self.df = DataFrame( { "a": np.arange(1_000_000, dtype=np.int32), @@ -189,16 +231,16 @@ def time_equals_non_object_index(self): class SetOperations: - params = [ ("monotonic", "non_monotonic"), - ("datetime", "int", "string"), + ("datetime", "int", "string", "ea_int"), ("intersection", "union", "symmetric_difference"), + (False, None), ] - param_names = ["index_structure", "dtype", "method"] + param_names = ["index_structure", "dtype", "method", "sort"] - def setup(self, index_structure, dtype, method): - N = 10 ** 5 + def setup(self, index_structure, dtype, method, sort): + N = 10**5 level1 = range(1000) level2 = date_range(start="1/1/2000", periods=N // 1000) @@ -210,10 +252,14 @@ def setup(self, index_structure, dtype, method): level2 = tm.makeStringIndex(N // 1000).values str_left = MultiIndex.from_product([level1, level2]) + level2 = range(N // 1000) + ea_int_left = MultiIndex.from_product([level1, Series(level2, dtype="Int64")]) + data = { "datetime": dates_left, "int": int_left, "string": str_left, + "ea_int": ea_int_left, } if index_structure == "non_monotonic": @@ -223,8 +269,131 @@ def setup(self, index_structure, dtype, method): self.left = data[dtype]["left"] self.right = data[dtype]["right"] - def time_operation(self, index_structure, dtype, method): - getattr(self.left, method)(self.right) + def time_operation(self, index_structure, dtype, method, sort): + getattr(self.left, method)(self.right, sort=sort) + + +class Difference: + params = [ + ("datetime", "int", "string", "ea_int"), + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**4 * 2 + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_left = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_left = MultiIndex.from_product([level1, level2]) + + level2 = Series(range(N // 1000), dtype="Int64") + level2[0] = NA + ea_int_left = MultiIndex.from_product([level1, level2]) + + level2 = tm.makeStringIndex(N // 1000).values + str_left = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_left, + "int": int_left, + "ea_int": ea_int_left, + "string": str_left, + } + + data = {k: {"left": mi, "right": mi[:5]} for k, mi in data.items()} + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] + + def time_difference(self, dtype): + self.left.difference(self.right) + + +class Unique: + params = [ + (("Int64", NA), ("int64", 0)), + ] + param_names = ["dtype_val"] + + def setup(self, dtype_val): + level = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(1_000_000)), + dtype=dtype_val[0], + ) + self.midx = MultiIndex.from_arrays([level, level]) + + level_dups = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(500_000)) * 2, + dtype=dtype_val[0], + ) + + self.midx_dups = MultiIndex.from_arrays([level_dups, level_dups]) + + def time_unique(self, dtype_val): + self.midx.unique() + + def 
time_unique_dups(self, dtype_val): + self.midx_dups.unique() + + +class Isin: + params = [ + ("string", "int", "datetime"), + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**5 + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_midx = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_midx = MultiIndex.from_product([level1, level2]) + + level2 = tm.makeStringIndex(N // 1000).values + str_midx = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_midx, + "int": int_midx, + "string": str_midx, + } + + self.midx = data[dtype] + self.values_small = self.midx[:100] + self.values_large = self.midx[100:] + + def time_isin_small(self, dtype): + self.midx.isin(self.values_small) + + def time_isin_large(self, dtype): + self.midx.isin(self.values_large) + + +class Putmask: + def setup(self): + N = 10**5 + level1 = range(1_000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + self.midx = MultiIndex.from_product([level1, level2]) + + level1 = range(1_000, 2_000) + self.midx_values = MultiIndex.from_product([level1, level2]) + + level2 = date_range(start="1/1/2010", periods=N // 1000) + self.midx_values_different = MultiIndex.from_product([level1, level2]) + self.mask = np.array([True, False] * (N // 2)) + + def time_putmask(self): + self.midx.putmask(self.mask, self.midx_values) + + def time_putmask_all_different(self): + self.midx.putmask(self.mask, self.midx_values_different) from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py index 8ca33db361fa0..257c82cba8878 100644 --- a/asv_bench/benchmarks/package.py +++ b/asv_bench/benchmarks/package.py @@ -4,22 +4,16 @@ import subprocess import sys -from pandas.compat import PY37 - class TimeImport: def time_import(self): - if PY37: - # on py37+ we the "-X importtime" usage gives us a more precise - # measurement of the import time we actually care about, - # without the subprocess or interpreter overhead - cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] - p = subprocess.run(cmd, stderr=subprocess.PIPE) - - line = p.stderr.splitlines()[-1] - field = line.split(b"|")[-2].strip() - total = int(field) # microseconds - return total + # on py37+ we the "-X importtime" usage gives us a more precise + # measurement of the import time we actually care about, + # without the subprocess or interpreter overhead + cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] + p = subprocess.run(cmd, stderr=subprocess.PIPE, check=True) - cmd = [sys.executable, "-c", "import pandas as pd"] - subprocess.run(cmd, stderr=subprocess.PIPE) + line = p.stderr.splitlines()[-1] + field = line.split(b"|")[-2].strip() + total = int(field) # microseconds + return total diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 23286343d7367..97d91111e833a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -15,9 +15,9 @@ # Compatibility import for the testing module try: - import pandas._testing as tm # noqa + import pandas._testing as tm except ImportError: - import pandas.util.testing as tm # noqa + import pandas.util.testing as tm # noqa:F401 numeric_dtypes = [ diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index e15d4c66e4fc0..501fe198d41d8 100644 --- a/asv_bench/benchmarks/period.py +++ 
b/asv_bench/benchmarks/period.py @@ -2,13 +2,19 @@ Period benchmarks with non-tslibs dependencies. See benchmarks.tslibs.period for benchmarks that rely only on tslibs. """ -from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range +from pandas import ( + DataFrame, + Period, + PeriodIndex, + Series, + date_range, + period_range, +) from pandas.tseries.frequencies import to_offset class PeriodIndexConstructor: - params = [["D"], [True, False]] param_names = ["freq", "is_offset"] @@ -52,7 +58,6 @@ def time_set_index(self): class Algorithms: - params = ["index", "series"] param_names = ["typ"] @@ -86,7 +91,7 @@ def time_get_loc(self): self.index.get_loc(self.period) def time_shallow_copy(self): - self.index._shallow_copy() + self.index._view() def time_series_loc(self): self.series.loc[self.period] diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 5c718516360ed..789bb8d8533b1 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,13 +1,29 @@ +import contextlib +import importlib.machinery +import importlib.util +import os +import pathlib +import sys +import tempfile +from unittest import mock + import matplotlib import numpy as np -from pandas import DataFrame, DatetimeIndex, Series, date_range +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + date_range, +) try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves +from pandas.plotting._core import _get_plot_backend + matplotlib.use("Agg") @@ -94,4 +110,55 @@ def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") +class BackendLoading: + repeat = 1 + number = 1 + warmup_time = 0 + + def setup(self): + mod = importlib.util.module_from_spec( + importlib.machinery.ModuleSpec("pandas_dummy_backend", None) + ) + mod.plot = lambda *args, **kwargs: 1 + + with contextlib.ExitStack() as stack: + stack.enter_context( + mock.patch.dict(sys.modules, {"pandas_dummy_backend": mod}) + ) + tmp_path = pathlib.Path(stack.enter_context(tempfile.TemporaryDirectory())) + + sys.path.insert(0, os.fsdecode(tmp_path)) + stack.callback(sys.path.remove, os.fsdecode(tmp_path)) + + dist_info = tmp_path / "my_backend-0.0.0.dist-info" + dist_info.mkdir() + (dist_info / "entry_points.txt").write_bytes( + b"[pandas_plotting_backends]\n" + b"my_ep_backend = pandas_dummy_backend\n" + b"my_ep_backend0 = pandas_dummy_backend\n" + b"my_ep_backend1 = pandas_dummy_backend\n" + b"my_ep_backend2 = pandas_dummy_backend\n" + b"my_ep_backend3 = pandas_dummy_backend\n" + b"my_ep_backend4 = pandas_dummy_backend\n" + b"my_ep_backend5 = pandas_dummy_backend\n" + b"my_ep_backend6 = pandas_dummy_backend\n" + b"my_ep_backend7 = pandas_dummy_backend\n" + b"my_ep_backend8 = pandas_dummy_backend\n" + b"my_ep_backend9 = pandas_dummy_backend\n" + ) + self.stack = stack.pop_all() + + def teardown(self): + self.stack.close() + + def time_get_plot_backend(self): + # finds the first my_ep_backend + _get_plot_backend("my_ep_backend") + + def time_get_plot_backend_fallback(self): + # iterates through all the my_ep_backend[0-9] before falling back + # to importlib.import_module + _get_plot_backend("pandas_dummy_backend") + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 03394e6fe08cb..eac4bb38eb18f 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,8 +1,15 @@ import numpy as np -from 
pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + date_range, + period_range, +) -from .pandas_vb_common import lib, tm +from .pandas_vb_common import tm class Reindex: @@ -21,6 +28,11 @@ def setup(self): index = MultiIndex.from_arrays([level1, level2]) self.s = Series(np.random.randn(N * K), index=index) self.s_subset = self.s[::2] + self.s_subset_no_cache = self.s[::2].copy() + + mi = MultiIndex.from_product([rng, range(100)]) + self.s2 = Series(np.random.randn(len(mi)), index=mi) + self.s2_subset = self.s2[::2].copy() def time_reindex_dates(self): self.df.reindex(self.rng_subset) @@ -28,12 +40,20 @@ def time_reindex_dates(self): def time_reindex_columns(self): self.df2.reindex(columns=self.df.columns[1:5]) - def time_reindex_multiindex(self): + def time_reindex_multiindex_with_cache(self): + # MultiIndex._values gets cached self.s.reindex(self.s_subset.index) + def time_reindex_multiindex_no_cache(self): + # Copy to avoid MultiIndex._values getting cached + self.s.reindex(self.s_subset_no_cache.index.copy()) -class ReindexMethod: + def time_reindex_multiindex_no_cache_dates(self): + # Copy to avoid MultiIndex._values getting cached + self.s2_subset.reindex(self.s2.index.copy()) + +class ReindexMethod: params = [["pad", "backfill"], [date_range, period_range]] param_names = ["method", "constructor"] @@ -47,7 +67,6 @@ def time_reindex_method(self, method, constructor): class Fillna: - params = ["pad", "backfill"] param_names = ["method"] @@ -86,7 +105,6 @@ def time_reindex_level(self): class DropDuplicates: - params = [True, False] param_names = ["inplace"] @@ -145,19 +163,4 @@ def time_align_series_irregular_string(self): self.x + self.y -class LibFastZip: - def setup(self): - N = 10000 - K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) - col_array = np.vstack([key1, key2, np.random.randn(N * K)]) - col_array2 = col_array.copy() - col_array2[:, :10000] = np.nan - self.col_array_list = list(col_array) - - def time_lib_fast_zip(self): - lib.fast_zip(self.col_array_list) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 2a115fb0b4fe3..36b5b54e4440b 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -4,12 +4,11 @@ class FillNa: - params = [True, False] param_names = ["inplace"] def setup(self, inplace): - N = 10 ** 6 + N = 10**6 rng = pd.date_range("1/1/2000", periods=N, freq="min") data = np.random.randn(N) data[::2] = np.nan @@ -23,15 +22,14 @@ def time_replace(self, inplace): class ReplaceDict: - params = [True, False] param_names = ["inplace"] def setup(self, inplace): - N = 10 ** 5 - start_value = 10 ** 5 + N = 10**5 + start_value = 10**5 self.to_rep = dict(enumerate(np.arange(N) + start_value)) - self.s = pd.Series(np.random.randint(N, size=10 ** 3)) + self.s = pd.Series(np.random.randint(N, size=10**3)) def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) @@ -44,23 +42,22 @@ class ReplaceList: param_names = ["inplace"] def setup(self, inplace): - self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10**7)) def time_replace_list(self, inplace): self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) def time_replace_list_one_match(self, inplace): - # the 1 can be held in self._df.blocks[0], while 
the inf and -inf cant + # the 1 can be held in self._df.blocks[0], while the inf and -inf can't self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) class Convert: - params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) param_names = ["constructor", "replace_data"] def setup(self, constructor, replace_data): - N = 10 ** 3 + N = 10**3 data = { "Series": pd.Series(np.random.randint(N, size=N)), "DataFrame": pd.DataFrame( diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 21081ee23a773..551af7ccb40bc 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -4,16 +4,28 @@ import numpy as np import pandas as pd -from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long +from pandas import ( + DataFrame, + MultiIndex, + date_range, + melt, + wide_to_long, +) +from pandas.api.types import CategoricalDtype class Melt: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"]) - self.df["id1"] = np.random.randint(0, 10, 10000) - self.df["id2"] = np.random.randint(100, 1000, 10000) + params = ["float64", "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + self.df = DataFrame( + np.random.randn(100_000, 3), columns=["A", "B", "C"], dtype=dtype + ) + self.df["id1"] = pd.Series(np.random.randint(0, 10, 10000)) + self.df["id2"] = pd.Series(np.random.randint(100, 1000, 10000)) - def time_melt_dataframe(self): + def time_melt_dataframe(self, dtype): melt(self.df, id_vars=["id1", "id2"]) @@ -29,7 +41,7 @@ def setup(self): self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.df.pivot("date", "variable", "value") + self.df.pivot(index="date", columns="variable", values="value") class SimpleReshape: @@ -46,8 +58,42 @@ def time_unstack(self): self.df.unstack(1) -class Unstack: +class ReshapeExtensionDtype: + params = ["datetime64[ns, US/Pacific]", "Period[s]"] + param_names = ["dtype"] + + def setup(self, dtype): + lev = pd.Index(list("ABCDEFGHIJ")) + ri = pd.Index(range(1000)) + mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"]) + + index = date_range("2016-01-01", periods=10000, freq="s", tz="US/Pacific") + if dtype == "Period[s]": + index = index.tz_localize(None).to_period("s") + + ser = pd.Series(index, index=mi) + df = ser.unstack("bar") + # roundtrips -> df.stack().equals(ser) + self.ser = ser + self.df = df + + def time_stack(self, dtype): + self.df.stack() + + def time_unstack_fast(self, dtype): + # last level -> doesn't have to make copies + self.ser.unstack("bar") + + def time_unstack_slow(self, dtype): + # first level -> must make copies + self.ser.unstack("foo") + + def time_transpose(self, dtype): + self.df.T + + +class Unstack: params = ["int", "category"] def setup(self, dtype): @@ -59,6 +105,7 @@ def setup(self, dtype): columns = np.arange(n) if dtype == "int": values = np.arange(m * m * n).reshape(m * m, n) + self.df = DataFrame(values, index, columns) else: # the category branch is ~20x slower than int. So we # cut down the size a bit. Now it's only ~3x slower. 
@@ -68,7 +115,8 @@ def setup(self, dtype): values = np.take(list(string.ascii_letters), indices) values = [pd.Categorical(v) for v in values.T] - self.df = DataFrame(values, index, columns) + self.df = DataFrame(dict(enumerate(values)), index, columns) + self.df2 = self.df.iloc[:-1] def time_full_product(self, dtype): @@ -103,7 +151,10 @@ def setup(self): nidvars = 20 N = 5000 self.letters = list("ABCD") - yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] + yrvars = [ + letter + str(num) + for letter, num in product(self.letters, range(1, nyrs + 1)) + ] columns = [str(i) for i in range(nidvars)] + yrvars self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns) self.df["id"] = self.df.index @@ -162,7 +213,7 @@ def time_pivot_table_categorical_observed(self): ) def time_pivot_table_margins_only_column(self): - self.df.pivot_table(columns=["key2", "key3"], margins=True) + self.df.pivot_table(columns=["key1", "key2", "key3"], margins=True) class Crosstab: @@ -193,7 +244,7 @@ def setup(self): categories = list(string.ascii_letters[:12]) s = pd.Series( np.random.choice(categories, size=1000000), - dtype=pd.api.types.CategoricalDtype(categories), + dtype=CategoricalDtype(categories), ) self.s = s @@ -209,7 +260,7 @@ class Cut: param_names = ["bins"] def setup(self, bins): - N = 10 ** 5 + N = 10**5 self.int_series = pd.Series(np.arange(N).repeat(5)) self.float_series = pd.Series(np.random.randn(N).repeat(5)) self.timedelta_series = pd.Series( @@ -258,7 +309,6 @@ class Explode: params = [[100, 1000, 10000], [3, 5, 10]] def setup(self, n_rows, max_list_length): - data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)] self.series = pd.Series(data) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f0dd908f81043..bd4da00bfd2ad 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,28 +1,31 @@ +import warnings + import numpy as np import pandas as pd class Methods: - params = ( ["DataFrame", "Series"], - [10, 1000], + [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], ) - param_names = ["constructor", "window", "dtype", "method"] + param_names = ["constructor", "window_kwargs", "dtype", "method"] - def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + def setup(self, constructor, window_kwargs, dtype, method): + N = 10**5 + window, kwargs = window_kwargs arr = (100 * np.random.random(N)).astype(dtype) - self.roll = getattr(pd, constructor)(arr).rolling(window) + obj = getattr(pd, constructor)(arr) + self.window = getattr(obj, window)(**kwargs) - def time_rolling(self, constructor, window, dtype, method): - getattr(self.roll, method)() + def time_method(self, constructor, window_kwargs, dtype, method): + getattr(self.window, method)() - def peakmem_rolling(self, constructor, window, dtype, method): - getattr(self.roll, method)() + def peakmem_method(self, constructor, window_kwargs, dtype, method): + getattr(self.window, method)() class Apply: @@ -36,7 +39,7 @@ class Apply: param_names = ["constructor", "window", "dtype", "function", "raw"] def setup(self, constructor, window, dtype, function, raw): - N = 10 ** 3 + N = 10**3 arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -44,64 +47,115 
@@ def time_rolling(self, constructor, window, dtype, function, raw): self.roll.apply(function, raw=raw) -class Engine: +class NumbaEngineMethods: params = ( ["DataFrame", "Series"], ["int", "float"], - [np.sum, lambda x: np.sum(x) + 5], - ["cython", "numba"], + [("rolling", {"window": 10}), ("expanding", {})], + ["sum", "max", "min", "median", "mean", "var", "std"], + [True, False], + [None, 100], ) - param_names = ["constructor", "dtype", "function", "engine"] - - def setup(self, constructor, dtype, function, engine): - N = 10 ** 3 - arr = (100 * np.random.random(N)).astype(dtype) - self.data = getattr(pd, constructor)(arr) - - def time_rolling_apply(self, constructor, dtype, function, engine): - self.data.rolling(10).apply(function, raw=True, engine=engine) - - def time_expanding_apply(self, constructor, dtype, function, engine): - self.data.expanding().apply(function, raw=True, engine=engine) - - -class ExpandingMethods: - + param_names = [ + "constructor", + "dtype", + "window_kwargs", + "method", + "parallel", + "cols", + ] + + def setup(self, constructor, dtype, window_kwargs, method, parallel, cols): + N = 10**3 + window, kwargs = window_kwargs + shape = (N, cols) if cols is not None and constructor != "Series" else N + arr = (100 * np.random.random(shape)).astype(dtype) + data = getattr(pd, constructor)(arr) + + # Warm the cache + with warnings.catch_warnings(record=True): + # Catch parallel=True not being applicable e.g. 1D data + self.window = getattr(data, window)(**kwargs) + getattr(self.window, method)( + engine="numba", engine_kwargs={"parallel": parallel} + ) + + def test_method(self, constructor, dtype, window_kwargs, method, parallel, cols): + with warnings.catch_warnings(record=True): + getattr(self.window, method)( + engine="numba", engine_kwargs={"parallel": parallel} + ) + + +class NumbaEngineApply: params = ( ["DataFrame", "Series"], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + [("rolling", {"window": 10}), ("expanding", {})], + [np.sum, lambda x: np.sum(x) + 5], + [True, False], + [None, 100], ) - param_names = ["constructor", "window", "dtype", "method"] - - def setup(self, constructor, dtype, method): - N = 10 ** 5 - arr = (100 * np.random.random(N)).astype(dtype) - self.expanding = getattr(pd, constructor)(arr).expanding() - - def time_expanding(self, constructor, dtype, method): - getattr(self.expanding, method)() + param_names = [ + "constructor", + "dtype", + "window_kwargs", + "function", + "parallel", + "cols", + ] + + def setup(self, constructor, dtype, window_kwargs, function, parallel, cols): + N = 10**3 + window, kwargs = window_kwargs + shape = (N, cols) if cols is not None and constructor != "Series" else N + arr = (100 * np.random.random(shape)).astype(dtype) + data = getattr(pd, constructor)(arr) + + # Warm the cache + with warnings.catch_warnings(record=True): + # Catch parallel=True not being applicable e.g. 
1D data + self.window = getattr(data, window)(**kwargs) + self.window.apply( + function, raw=True, engine="numba", engine_kwargs={"parallel": parallel} + ) + + def test_method(self, constructor, dtype, window_kwargs, function, parallel, cols): + with warnings.catch_warnings(record=True): + self.window.apply( + function, raw=True, engine="numba", engine_kwargs={"parallel": parallel} + ) class EWMMethods: + params = ( + ["DataFrame", "Series"], + [ + ({"halflife": 10}, "mean"), + ({"halflife": 10}, "std"), + ({"halflife": 1000}, "mean"), + ({"halflife": 1000}, "std"), + ( + { + "halflife": "1 Day", + "times": pd.date_range("1900", periods=10**5, freq="23s"), + }, + "mean", + ), + ], + ["int", "float"], + ) + param_names = ["constructor", "kwargs_method", "dtype"] - params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) - param_names = ["constructor", "window", "dtype", "method"] - - def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + def setup(self, constructor, kwargs_method, dtype): + N = 10**5 + kwargs, method = kwargs_method arr = (100 * np.random.random(N)).astype(dtype) - times = pd.date_range("1900", periods=N, freq="23s") - self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) - self.ewm_times = getattr(pd, constructor)(arr).ewm( - halflife="1 Day", times=times - ) - - def time_ewm(self, constructor, window, dtype, method): - getattr(self.ewm, method)() + self.method = method + self.ewm = getattr(pd, constructor)(arr).ewm(**kwargs) - def time_ewm_times(self, constructor, window, dtype, method): - self.ewm.mean() + def time_ewm(self, constructor, kwargs_method, dtype): + getattr(self.ewm, self.method)() class VariableWindowMethods(Methods): @@ -109,33 +163,42 @@ class VariableWindowMethods(Methods): ["DataFrame", "Series"], ["50s", "1h", "1d"], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], ) param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10 ** 5 + N = 10**5 arr = (100 * np.random.random(N)).astype(dtype) index = pd.date_range("2017-01-01", periods=N, freq="5s") - self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) + self.window = getattr(pd, constructor)(arr, index=index).rolling(window) class Pairwise: + params = ( + [({"window": 10}, "rolling"), ({"window": 1000}, "rolling"), ({}, "expanding")], + ["corr", "cov"], + [True, False], + ) + param_names = ["window_kwargs", "method", "pairwise"] - params = ([10, 1000, None], ["corr", "cov"], [True, False]) - param_names = ["window", "method", "pairwise"] - - def setup(self, window, method, pairwise): - N = 10 ** 4 + def setup(self, kwargs_window, method, pairwise): + N = 10**4 + n_groups = 20 + kwargs, window = kwargs_window + groups = [i for _ in range(N // n_groups) for i in range(n_groups)] arr = np.random.random(N) self.df = pd.DataFrame(arr) + self.window = getattr(self.df, window)(**kwargs) + self.window_group = getattr( + pd.DataFrame({"A": groups, "B": arr}).groupby("A"), window + )(**kwargs) - def time_pairwise(self, window, method, pairwise): - if window is None: - r = self.df.expanding() - else: - r = self.df.rolling(window=window) - getattr(r, method)(self.df, pairwise=pairwise) + def time_pairwise(self, kwargs_window, method, pairwise): + getattr(self.window, method)(self.df, pairwise=pairwise) + + def time_groupby(self, kwargs_window, method, pairwise): + 
getattr(self.window_group, method)(self.df, pairwise=pairwise) class Quantile: @@ -149,7 +212,7 @@ class Quantile: param_names = ["constructor", "window", "dtype", "percentile"] def setup(self, constructor, window, dtype, percentile, interpolation): - N = 10 ** 5 + N = 10**5 arr = np.random.random(N).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -157,12 +220,38 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) -class PeakMemFixedWindowMinMax: +class Rank: + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + [True, False], + [True, False], + ["min", "max", "average"], + ) + param_names = [ + "constructor", + "window", + "dtype", + "percentile", + "ascending", + "method", + ] + + def setup(self, constructor, window, dtype, percentile, ascending, method): + N = 10**5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_rank(self, constructor, window, dtype, percentile, ascending, method): + self.roll.rank(pct=percentile, ascending=ascending, method=method) + +class PeakMemFixedWindowMinMax: params = ["min", "max"] def setup(self, operation): - N = int(1e6) + N = 10**6 arr = np.random.random(N) self.roll = pd.Series(arr).rolling(2) @@ -181,7 +270,7 @@ class ForwardWindowMethods: param_names = ["constructor", "window_size", "dtype", "method"] def setup(self, constructor, window_size, dtype, method): - N = 10 ** 5 + N = 10**5 arr = np.random.random(N).astype(dtype) indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=window_size) self.roll = getattr(pd, constructor)(arr).rolling(window=indexer) @@ -194,26 +283,88 @@ def peakmem_rolling(self, constructor, window_size, dtype, method): class Groupby: + params = ( + ["sum", "median", "mean", "max", "min", "kurt", "sum"], + [ + ("rolling", {"window": 2}), + ("rolling", {"window": "30s"}), + ("expanding", {}), + ], + ) - params = ["sum", "median", "mean", "max", "min", "kurt", "sum"] - - def setup(self, method): + def setup(self, method, window_kwargs): N = 1000 + window, kwargs = window_kwargs df = pd.DataFrame( { "A": [str(i) for i in range(N)] * 10, "B": list(range(N)) * 10, - "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) - self.groupby_roll_int = df.groupby("A").rolling(window=2) - self.groupby_roll_offset = df.groupby("A").rolling(window="30s", on="C") + if isinstance(kwargs.get("window", None), str): + df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10) + self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) + + def time_method(self, method, window_kwargs): + getattr(self.groupby_window, method)() + + +class GroupbyLargeGroups: + # https://github.com/pandas-dev/pandas/issues/38038 + # specific example where the rolling operation on a larger dataframe + # is relatively cheap (few but large groups), but creation of + # MultiIndex of result can be expensive - def time_rolling_int(self, method): - getattr(self.groupby_roll_int, method)() + def setup(self): + N = 100000 + self.df = pd.DataFrame({"A": [1, 2] * (N // 2), "B": np.random.randn(N)}) + + def time_rolling_multiindex_creation(self): + self.df.groupby("A").rolling(3).mean() + + +class GroupbyEWM: + params = ["var", "std", "cov", "corr"] + param_names = ["method"] + + def setup(self, method): + df = pd.DataFrame({"A": range(50), "B": range(50)}) + self.gb_ewm = df.groupby("A").ewm(com=1.0) + + def 
time_groupby_method(self, method): + getattr(self.gb_ewm, method)() + + +class GroupbyEWMEngine: + params = ["cython", "numba"] + param_names = ["engine"] + + def setup(self, engine): + df = pd.DataFrame({"A": range(50), "B": range(50)}) + self.gb_ewm = df.groupby("A").ewm(com=1.0) + + def time_groupby_mean(self, engine): + self.gb_ewm.mean(engine=engine) + + +def table_method_func(x): + return np.sum(x, axis=0) + 1 + + +class TableMethod: + params = ["single", "table"] + param_names = ["method"] + + def setup(self, method): + self.df = pd.DataFrame(np.random.randn(10, 1000)) + + def time_apply(self, method): + self.df.rolling(2, method=method).apply( + table_method_func, raw=True, engine="numba" + ) - def time_rolling_offset(self, method): - getattr(self.groupby_roll_offset, method)() + def time_ewm_mean(self, method): + self.df.ewm(1, method=method).mean(engine="numba") from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 258c29c145721..204393cbb76f2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -2,96 +2,50 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import ( + NA, + Index, + NaT, + Series, + date_range, +) from .pandas_vb_common import tm class SeriesConstructor: - - params = [None, "dict"] - param_names = ["data"] - - def setup(self, data): + def setup(self): self.idx = date_range( start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" ) - dict_data = dict(zip(self.idx, range(len(self.idx)))) - self.data = None if data is None else dict_data + self.data = dict(zip(self.idx, range(len(self.idx)))) + self.array = np.array([1, 2, 3]) + self.idx2 = Index(["a", "b", "c"]) - def time_constructor(self, data): + def time_constructor_dict(self): Series(data=self.data, index=self.idx) + def time_constructor_no_data(self): + Series(data=None, index=self.idx) -class IsIn: - - params = ["int64", "uint64", "object"] - param_names = ["dtype"] - - def setup(self, dtype): - self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) - self.values = [1, 2] - - def time_isin(self, dtypes): - self.s.isin(self.values) - + def time_constructor_fastpath(self): + Series(self.array, index=self.idx2, name="name", fastpath=True) -class IsInFloat64: - def setup(self): - self.small = Series([1, 2], dtype=np.float64) - self.many_different_values = np.arange(10 ** 6, dtype=np.float64) - self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) - self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) - - def time_isin_many_different(self): - # runtime is dominated by creation of the lookup-table - self.small.isin(self.many_different_values) - def time_isin_few_different(self): - # runtime is dominated by creation of the lookup-table - self.small.isin(self.few_different_values) +class ToFrame: + params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] + param_names = ["dtype", "name"] - def time_isin_nan_values(self): - # runtime is dominated by creation of the lookup-table - self.small.isin(self.few_different_values) + def setup(self, dtype, name): + arr = np.arange(10**5) + ser = Series(arr, dtype=dtype) + self.ser = ser - -class IsInForObjects: - def setup(self): - self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(object) - self.vals_nans = np.full(10 ** 4, np.nan).astype(object) - self.s_short = Series(np.arange(2)).astype(object) - self.s_long = Series(np.arange(10 
** 5)).astype(object) - self.vals_short = np.arange(2).astype(object) - self.vals_long = np.arange(10 ** 5).astype(object) - # because of nans floats are special: - self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(object) - self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object) - - def time_isin_nans(self): - # if nan-objects are different objects, - # this has the potential to trigger O(n^2) running time - self.s_nans.isin(self.vals_nans) - - def time_isin_short_series_long_values(self): - # running time dominated by the preprocessing - self.s_short.isin(self.vals_long) - - def time_isin_long_series_short_values(self): - # running time dominated by look-up - self.s_long.isin(self.vals_short) - - def time_isin_long_series_long_values(self): - # no dominating part - self.s_long.isin(self.vals_long) - - def time_isin_long_series_long_values_floats(self): - # no dominating part - self.s_long_floats.isin(self.vals_long_floats) + def time_to_frame(self, dtype, name): + self.ser.to_frame(name) class NSort: - params = ["first", "last", "all"] param_names = ["keep"] @@ -106,12 +60,11 @@ def time_nsmallest(self, keep): class Dropna: - params = ["int", "datetime"] param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 6 + N = 10**6 data = { "int": np.random.randint(1, 10, N), "datetime": date_range("2000-01-01", freq="S", periods=N), @@ -124,8 +77,49 @@ def time_dropna(self, dtype): self.s.dropna() -class SearchSorted: +class Fillna: + params = [ + [ + "datetime64[ns]", + "float64", + "Float64", + "Int64", + "int64[pyarrow]", + "string", + "string[pyarrow]", + ], + [None, "pad", "backfill"], + ] + param_names = ["dtype", "method"] + + def setup(self, dtype, method): + N = 10**6 + if dtype == "datetime64[ns]": + data = date_range("2000-01-01", freq="S", periods=N) + na_value = NaT + elif dtype in ("float64", "Float64"): + data = np.random.randn(N) + na_value = np.nan + elif dtype in ("Int64", "int64[pyarrow]"): + data = np.arange(N) + na_value = NA + elif dtype in ("string", "string[pyarrow]"): + data = tm.rands_array(5, N) + na_value = NA + else: + raise NotImplementedError + fill_value = data[0] + ser = Series(data, dtype=dtype) + ser[::2] = na_value + self.ser = ser + self.fill_value = fill_value + + def time_fillna(self, dtype, method): + value = self.fill_value if method is None else None + self.ser.fillna(value=value, method=method) + +class SearchSorted: goal_time = 0.2 params = [ "int8", @@ -144,7 +138,7 @@ class SearchSorted: param_names = ["dtype"] def setup(self, dtype): - N = 10 ** 5 + N = 10**5 data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) self.s = Series(data) @@ -154,7 +148,6 @@ def time_searchsorted(self, dtype): class Map: - params = (["dict", "Series", "lambda"], ["object", "category", "int"]) param_names = "mapper" @@ -180,7 +173,7 @@ def time_map(self, mapper, *args, **kwargs): class Clip: - params = [50, 1000, 10 ** 5] + params = [50, 1000, 10**5] param_names = ["n"] def setup(self, n): @@ -190,18 +183,72 @@ def time_clip(self, n): self.s.clip(0, 1) -class ValueCounts: +class ClipDt: + def setup(self): + dr = date_range("20220101", periods=100_000, freq="s", tz="UTC") + self.clipper_dt = dr[0:1_000].repeat(100) + self.s = Series(dr) - params = ["int", "uint", "float", "object"] - param_names = ["dtype"] + def time_clip(self): + self.s.clip(upper=self.clipper_dt) - def setup(self, dtype): - self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) - def time_value_counts(self, dtype): +class ValueCounts: + 
params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] + param_names = ["N", "dtype"] + + def setup(self, N, dtype): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) + + def time_value_counts(self, N, dtype): self.s.value_counts() +class ValueCountsEA: + params = [[10**3, 10**4, 10**5], [True, False]] + param_names = ["N", "dropna"] + + def setup(self, N, dropna): + self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64") + self.s.loc[1] = NA + + def time_value_counts(self, N, dropna): + self.s.value_counts(dropna=dropna) + + +class ValueCountsObjectDropNAFalse: + params = [10**3, 10**4, 10**5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_value_counts(self, N): + self.s.value_counts(dropna=False) + + +class Mode: + params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] + param_names = ["N", "dtype"] + + def setup(self, N, dtype): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) + + def time_mode(self, N, dtype): + self.s.mode() + + +class ModeObjectDropNAFalse: + params = [10**3, 10**4, 10**5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_mode(self, N): + self.s.mode(dropna=False) + + class Dir: def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) @@ -213,15 +260,14 @@ def time_dir_strings(self): class SeriesGetattr: # https://github.com/pandas-dev/pandas/issues/19764 def setup(self): - self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6))) + self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=10**6)) def time_series_datetimeindex_repr(self): getattr(self.s, "a", None) class All: - - params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] def setup(self, N, case, dtype): @@ -233,8 +279,7 @@ def time_all(self, N, case, dtype): class Any: - - params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] def setup(self, N, case, dtype): @@ -246,7 +291,6 @@ def time_any(self, N, case, dtype): class NanOps: - params = [ [ "var", @@ -262,7 +306,7 @@ class NanOps: "kurt", "prod", ], - [10 ** 3, 10 ** 6], + [10**3, 10**6], ["int8", "int32", "int64", "float64", "Int64", "boolean"], ] param_names = ["func", "N", "dtype"] @@ -278,4 +322,97 @@ def time_func(self, func, N, dtype): self.func() +class Rank: + param_names = ["dtype"] + params = [ + ["int", "uint", "float", "object"], + ] + + def setup(self, dtype): + self.s = Series(np.random.randint(0, 1000, size=100000), dtype=dtype) + + def time_rank(self, dtype): + self.s.rank() + + +class Iter: + param_names = ["dtype"] + params = [ + "bool", + "boolean", + "int64", + "Int64", + "float64", + "Float64", + "datetime64[ns]", + ] + + def setup(self, dtype): + N = 10**5 + if dtype in ["bool", "boolean"]: + data = np.repeat([True, False], N // 2) + elif dtype in ["int64", "Int64"]: + data = np.arange(N) + elif dtype in ["float64", "Float64"]: + data = np.random.randn(N) + elif dtype == "datetime64[ns]": + data = date_range("2000-01-01", freq="s", periods=N) + else: + raise NotImplementedError + + self.s = Series(data, dtype=dtype) + + def time_iter(self, dtype): + for v in self.s: + pass + + +class ToNumpy: + def setup(self): + N 
= 1_000_000 + self.ser = Series( + np.random.randn( + N, + ) + ) + + def time_to_numpy(self): + self.ser.to_numpy() + + def time_to_numpy_double_copy(self): + self.ser.to_numpy(dtype="float64", copy=True) + + def time_to_numpy_copy(self): + self.ser.to_numpy(copy=True) + + +class Replace: + param_names = ["num_to_replace"] + params = [100, 1000] + + def setup(self, num_to_replace): + N = 1_000_000 + self.arr = np.random.randn(N) + self.arr1 = self.arr.copy() + np.random.shuffle(self.arr1) + self.ser = Series(self.arr) + + self.to_replace_list = np.random.choice(self.arr, num_to_replace) + self.values_list = np.random.choice(self.arr1, num_to_replace) + + self.replace_dict = dict(zip(self.to_replace_list, self.values_list)) + + def time_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def peakmem_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def time_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + def peakmem_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 28ceb25eebd96..c8a9a9e6e9176 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -2,7 +2,11 @@ import scipy.sparse import pandas as pd -from pandas import MultiIndex, Series, date_range +from pandas import ( + MultiIndex, + Series, + date_range, +) from pandas.arrays import SparseArray @@ -24,19 +28,18 @@ def setup(self): data = np.random.randn(N)[:-i] idx = rng[:-i] data[100:] = np.nan - self.series[i] = pd.Series(pd.SparseArray(data), index=idx) + self.series[i] = Series(SparseArray(data), index=idx) def time_series_to_frame(self): pd.DataFrame(self.series) class SparseArrayConstructor: - params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, object]) param_names = ["dense_proportion", "fill_value", "dtype"] def setup(self, dense_proportion, fill_value, dtype): - N = 10 ** 6 + N = 10**6 self.array = make_array(N, dense_proportion, fill_value, dtype) def time_sparse_array(self, dense_proportion, fill_value, dtype): @@ -59,29 +62,54 @@ def setup(self): ) def time_sparse_series_from_coo(self): - pd.Series.sparse.from_coo(self.matrix) + Series.sparse.from_coo(self.matrix) class ToCoo: - def setup(self): + params = [True, False] + param_names = ["sort_labels"] + + def setup(self, sort_labels): s = Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 s[999] = 12.1 - s.index = MultiIndex.from_product([range(10)] * 4) - self.ss = s.astype("Sparse") - def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) + s_mult_lvl = s.set_axis(MultiIndex.from_product([range(10)] * 4)) + self.ss_mult_lvl = s_mult_lvl.astype("Sparse") + s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2)) + self.ss_two_lvl = s_two_lvl.astype("Sparse") -class Arithmetic: + def time_sparse_series_to_coo(self, sort_labels): + self.ss_mult_lvl.sparse.to_coo( + row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels + ) + + def time_sparse_series_to_coo_single_level(self, sort_labels): + self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels) + + +class ToCooFrame: + def setup(self): + N = 10000 + k = 10 + arr = np.zeros((N, k), dtype=float) + arr[0, 0] = 3.0 + arr[12, 7] = -1.0 + arr[0, 9] = 11.2 + self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float", 
fill_value=0.0)) + def time_to_coo(self): + self.df.sparse.to_coo() + + +class Arithmetic: params = ([0.1, 0.01], [0, np.nan]) param_names = ["dense_proportion", "fill_value"] def setup(self, dense_proportion, fill_value): - N = 10 ** 6 + N = 10**6 arr1 = make_array(N, dense_proportion, fill_value, np.int64) self.array1 = SparseArray(arr1, fill_value=fill_value) arr2 = make_array(N, dense_proportion, fill_value, np.int64) @@ -101,12 +129,11 @@ def time_divide(self, dense_proportion, fill_value): class ArithmeticBlock: - params = [np.nan, 0] param_names = ["fill_value"] def setup(self, fill_value): - N = 10 ** 6 + N = 10**6 self.arr1 = self.make_block_array( length=N, num_blocks=1000, block_size=10, fill_value=fill_value ) @@ -116,10 +143,10 @@ def setup(self, fill_value): def make_block_array(self, length, num_blocks, block_size, fill_value): arr = np.full(length, fill_value) - indicies = np.random.choice( + indices = np.random.choice( np.arange(0, length, block_size), num_blocks, replace=False ) - for ind in indicies: + for ind in indices: arr[ind : ind + block_size] = np.random.randint(0, 100, block_size) return SparseArray(arr, fill_value=fill_value) @@ -136,4 +163,65 @@ def time_division(self, fill_value): self.arr1 / self.arr2 +class MinMax: + params = (["min", "max"], [0.0, np.nan]) + param_names = ["func", "fill_value"] + + def setup(self, func, fill_value): + N = 1_000_000 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_min_max(self, func, fill_value): + getattr(self.sp_arr, func)() + + +class Take: + params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False]) + param_names = ["indices", "allow_fill"] + + def setup(self, indices, allow_fill): + N = 1_000_000 + fill_value = 0.0 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) + + def time_take(self, indices, allow_fill): + self.sp_arr.take(indices, allow_fill=allow_fill) + + +class GetItem: + def setup(self): + N = 1_000_000 + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_integer_indexing(self): + self.sp_arr[78] + + def time_slice(self): + self.sp_arr[1:] + + +class GetItemMask: + params = [True, False, np.nan] + param_names = ["fill_value"] + + def setup(self, fill_value): + N = 1_000_000 + d = 1e-5 + arr = make_array(N, d, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool_) + fv_inds = np.unique( + np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32) + ) + b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value + self.sp_b_arr = SparseArray(b_arr, dtype=np.bool_, fill_value=fill_value) + + def time_mask(self, fill_value): + self.sp_arr[self.sp_b_arr] + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 5639d6702a92c..65bcb3d55c4f1 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -2,18 +2,14 @@ import pandas as pd -ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] +ops = ["mean", "sum", "median", "std", "skew", "kurt", "prod", "sem", "var"] class FrameOps: - params = [ops, ["float", "int", "Int64"], [0, 1]] param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - if op == "mad" and dtype == "Int64": - # GH-33036, GH#33600 - raise NotImplementedError 
values = np.random.randn(100000, 4) if dtype == "Int64": values = values.astype(int) @@ -25,11 +21,10 @@ def time_op(self, op, dtype, axis): class FrameMultiIndexOps: + params = [ops] + param_names = ["op"] - params = ([0, 1, [0, 1]], ops) - param_names = ["level", "op"] - - def setup(self, level, op): + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -40,12 +35,11 @@ def setup(self, level, op): df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) - def time_op(self, level, op): - self.df_func(level=level) + def time_op(self, op): + self.df_func() class SeriesOps: - params = [ops, ["float", "int"]] param_names = ["op", "dtype"] @@ -58,11 +52,10 @@ def time_op(self, op, dtype): class SeriesMultiIndexOps: + params = [ops] + param_names = ["op"] - params = ([0, 1, [0, 1]], ops) - param_names = ["level", "op"] - - def setup(self, level, op): + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -73,17 +66,16 @@ def setup(self, level, op): s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) - def time_op(self, level, op): - self.s_func(level=level) + def time_op(self, op): + self.s_func() class Rank: - params = [["DataFrame", "Series"], [True, False]] param_names = ["constructor", "pct"] def setup(self, constructor, pct): - values = np.random.randn(10 ** 5) + values = np.random.randn(10**5) self.data = getattr(pd, constructor)(values) def time_rank(self, constructor, pct): @@ -94,7 +86,6 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [["spearman", "kendall", "pearson"]] param_names = ["method"] @@ -129,7 +120,6 @@ def time_corrwith_rows(self, method): class Covariance: - params = [] param_names = [] diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py new file mode 100644 index 0000000000000..ac1b7f65d2d90 --- /dev/null +++ b/asv_bench/benchmarks/strftime.py @@ -0,0 +1,64 @@ +import numpy as np + +import pandas as pd +from pandas import offsets + + +class DatetimeStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = pd.DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) + + def time_frame_date_to_str(self, obs): + self.data["d"].astype(str) + + def time_frame_date_formatting_default(self, obs): + self.data["d"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_date_formatting_custom(self, obs): + self.data["d"].dt.strftime(date_format="%Y---%m---%d") + + def time_frame_datetime_to_str(self, obs): + self.data["dt"].astype(str) + + def time_frame_datetime_formatting_default_date_only(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_datetime_formatting_default(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_datetime_formatting_default_with_float(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + + def time_frame_datetime_formatting_custom(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + +class BusinessHourStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + self.data = pd.DataFrame( + { + "off": [offsets.BusinessHour()] * obs, + } + ) + + def 
time_frame_offset_str(self, obs): + self.data["off"].apply(str) + + def time_frame_offset_repr(self, obs): + self.data["off"].apply(repr) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..59b7cd2accf88 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -2,105 +2,185 @@ import numpy as np -from pandas import DataFrame, Series +from pandas import ( + NA, + Categorical, + DataFrame, + Series, +) +from pandas.arrays import StringArray from .pandas_vb_common import tm -class Methods: - def setup(self): - self.s = Series(tm.makeStringIndex(10 ** 5)) +class Dtypes: + params = ["str", "string[python]", "string[pyarrow]"] + param_names = ["dtype"] + + def setup(self, dtype): + try: + self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) + except ImportError: + raise NotImplementedError + + +class Construction: + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + self.series_arr = tm.rands_array(nchars=10, size=10**5) + self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() + + # GH37371. Testing construction of string series/frames from ExtensionArrays + self.series_cat_arr = Categorical(self.series_arr) + self.frame_cat_arr = Categorical(self.frame_arr) + + def time_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) + + def peakmem_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) + + def time_frame_construction(self, dtype): + DataFrame(self.frame_arr, dtype=dtype) + + def peakmem_frame_construction(self, dtype): + DataFrame(self.frame_arr, dtype=dtype) + + def time_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) - def time_center(self): + def peakmem_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def time_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + + def peakmem_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + + +class Methods(Dtypes): + def time_center(self, dtype): self.s.str.center(100) - def time_count(self): + def time_count(self, dtype): self.s.str.count("A") - def time_endswith(self): + def time_endswith(self, dtype): self.s.str.endswith("A") - def time_extract(self): + def time_extract(self, dtype): with warnings.catch_warnings(record=True): self.s.str.extract("(\\w*)A(\\w*)") - def time_findall(self): + def time_findall(self, dtype): self.s.str.findall("[A-Z]+") - def time_find(self): + def time_find(self, dtype): self.s.str.find("[A-Z]+") - def time_rfind(self): + def time_rfind(self, dtype): self.s.str.rfind("[A-Z]+") - def time_get(self): + def time_fullmatch(self, dtype): + self.s.str.fullmatch("A") + + def time_get(self, dtype): self.s.str.get(0) - def time_len(self): + def time_len(self, dtype): self.s.str.len() - def time_join(self): + def time_join(self, dtype): self.s.str.join(" ") - def time_match(self): + def time_match(self, dtype): self.s.str.match("A") - def time_normalize(self): + def time_normalize(self, dtype): self.s.str.normalize("NFC") - def time_pad(self): + def time_pad(self, dtype): self.s.str.pad(100, side="both") - def time_partition(self): + def time_partition(self, dtype): self.s.str.partition("A") - def time_rpartition(self): + def time_rpartition(self, dtype): self.s.str.rpartition("A") - def time_replace(self): + def time_replace(self, dtype): self.s.str.replace("A", "\x01\x01") - def time_translate(self): + def time_translate(self, dtype): 
self.s.str.translate({"A": "\x01\x01"}) - def time_slice(self): + def time_slice(self, dtype): self.s.str.slice(5, 15, 2) - def time_startswith(self): + def time_startswith(self, dtype): self.s.str.startswith("A") - def time_strip(self): + def time_strip(self, dtype): self.s.str.strip("A") - def time_rstrip(self): + def time_rstrip(self, dtype): self.s.str.rstrip("A") - def time_lstrip(self): + def time_lstrip(self, dtype): self.s.str.lstrip("A") - def time_title(self): + def time_title(self, dtype): self.s.str.title() - def time_upper(self): + def time_upper(self, dtype): self.s.str.upper() - def time_lower(self): + def time_lower(self, dtype): self.s.str.lower() - def time_wrap(self): + def time_wrap(self, dtype): self.s.str.wrap(10) - def time_zfill(self): + def time_zfill(self, dtype): self.s.str.zfill(10) + def time_isalnum(self, dtype): + self.s.str.isalnum() -class Repeat: + def time_isalpha(self, dtype): + self.s.str.isalpha() + + def time_isdecimal(self, dtype): + self.s.str.isdecimal() + + def time_isdigit(self, dtype): + self.s.str.isdigit() + + def time_islower(self, dtype): + self.s.str.islower() + + def time_isnumeric(self, dtype): + self.s.str.isnumeric() + + def time_isspace(self, dtype): + self.s.str.isspace() + def time_istitle(self, dtype): + self.s.str.istitle() + + def time_isupper(self, dtype): + self.s.str.isupper() + + +class Repeat: params = ["int", "array"] param_names = ["repeats"] def setup(self, repeats): - N = 10 ** 5 + N = 10**5 self.s = Series(tm.makeStringIndex(N)) repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] @@ -110,12 +190,11 @@ def time_repeat(self, repeats): class Cat: - params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) param_names = ["other_cols", "sep", "na_rep", "na_frac"] def setup(self, other_cols, sep, na_rep, na_frac): - N = 10 ** 5 + N = 10**5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) if other_cols == 0: @@ -134,44 +213,56 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep) -class Contains: - - params = [True, False] - param_names = ["regex"] +class Contains(Dtypes): + params = (Dtypes.params, [True, False]) + param_names = ["dtype", "regex"] - def setup(self, regex): - self.s = Series(tm.makeStringIndex(10 ** 5)) + def setup(self, dtype, regex): + super().setup(dtype) - def time_contains(self, regex): + def time_contains(self, dtype, regex): self.s.str.contains("A", regex=regex) -class Split: +class Split(Dtypes): + params = (Dtypes.params, [True, False]) + param_names = ["dtype", "expand"] - params = [True, False] - param_names = ["expand"] + def setup(self, dtype, expand): + super().setup(dtype) + self.s = self.s.str.join("--") - def setup(self, expand): - self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--") - - def time_split(self, expand): + def time_split(self, dtype, expand): self.s.str.split("--", expand=expand) - def time_rsplit(self, expand): + def time_rsplit(self, dtype, expand): self.s.str.rsplit("--", expand=expand) -class Dummies: - def setup(self): - self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|") +class Extract(Dtypes): + params = (Dtypes.params, [True, False]) + param_names = ["dtype", "expand"] + + def setup(self, dtype, expand): + super().setup(dtype) + + def time_extract_single_group(self, dtype, expand): + with warnings.catch_warnings(record=True): + self.s.str.extract("(\\w*)A", expand=expand) 
- def time_get_dummies(self): + +class Dummies(Dtypes): + def setup(self, dtype): + super().setup(dtype) + self.s = self.s.str.join("|") + + def time_get_dummies(self, dtype): self.s.str.get_dummies("|") class Encode: def setup(self): - self.ser = Series(tm.makeUnicodeIndex()) + self.ser = Series(tm.makeStringIndex()) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") @@ -184,3 +275,24 @@ def setup(self): def time_vector_slice(self): # GH 2602 self.s.str[:5] + + +class Iter(Dtypes): + def time_iter(self, dtype): + for i in self.s: + pass + + +class StringArrayConstruction: + def setup(self): + self.series_arr = tm.rands_array(nchars=10, size=10**5) + self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) + + def time_string_array_construction(self): + StringArray(self.series_arr) + + def time_string_array_with_nan_construction(self): + StringArray(self.series_arr_nan) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index cfe05c3e257b1..cb0e4455e1a56 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -3,42 +3,11 @@ benchmarks.tslibs.timedelta for benchmarks that rely only on tslibs. """ -import numpy as np - -from pandas import DataFrame, Series, timedelta_range, to_timedelta - - -class ToTimedelta: - def setup(self): - self.ints = np.random.randint(0, 60, size=10000) - self.str_days = [] - self.str_seconds = [] - for i in self.ints: - self.str_days.append(f"{i} days") - self.str_seconds.append(f"00:00:{i:02d}") - - def time_convert_int(self): - to_timedelta(self.ints, unit="s") - - def time_convert_string_days(self): - to_timedelta(self.str_days) - - def time_convert_string_seconds(self): - to_timedelta(self.str_seconds) - - -class ToTimedeltaErrors: - - params = ["coerce", "ignore"] - param_names = ["errors"] - - def setup(self, errors): - ints = np.random.randint(0, 60, size=10000) - self.arr = [f"{i} days" for i in ints] - self.arr[-1] = "apple" - - def time_convert(self, errors): - to_timedelta(self.arr, errors=errors) +from pandas import ( + DataFrame, + Series, + timedelta_range, +) class DatetimeAccessor: @@ -74,7 +43,7 @@ def time_get_loc(self): self.index.get_loc(self.timedelta) def time_shallow_copy(self): - self.index._shallow_copy() + self.index._view() def time_series_loc(self): self.series.loc[self.timedelta] diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index b494dbd8a38fa..1253fefde2d5f 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -3,7 +3,13 @@ import dateutil import numpy as np -from pandas import DataFrame, Series, date_range, period_range, to_datetime +from pandas import ( + DataFrame, + Series, + date_range, + period_range, + timedelta_range, +) from pandas.tseries.frequencies import infer_freq @@ -14,7 +20,6 @@ class DatetimeIndex: - params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"] param_names = ["index_type"] @@ -62,7 +67,6 @@ def time_is_dates_only(self, index_type): class TzLocalize: - params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] param_names = "tz" @@ -82,7 +86,6 @@ def time_infer_dst(self, tz): class ResetIndex: - params = [None, "US/Eastern"] param_names = "tz" @@ -90,12 +93,12 @@ def setup(self, tz): idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) - def 
time_reest_datetimeindex(self, tz): + def time_reset_datetimeindex(self, tz): self.df.reset_index() class InferFreq: - + # This depends mostly on code in _libs/, tseries/, and core.algos.unique params = [None, "D", "B"] param_names = ["freq"] @@ -120,13 +123,15 @@ def time_convert(self): class Iteration: - - params = [date_range, period_range] + params = [date_range, period_range, timedelta_range] param_names = ["time_index"] def setup(self, time_index): - N = 10 ** 6 - self.idx = time_index(start="20140101", freq="T", periods=N) + N = 10**6 + if time_index is timedelta_range: + self.idx = time_index(start=0, freq="T", periods=N) + else: + self.idx = time_index(start="20140101", freq="T", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -140,7 +145,6 @@ def time_iter_preexit(self, time_index): class ResampleDataFrame: - params = ["max", "mean", "min"] param_names = ["method"] @@ -154,7 +158,6 @@ def time_method(self, method): class ResampleSeries: - params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"]) param_names = ["index", "freq", "method"] @@ -184,7 +187,6 @@ def time_resample(self): class AsOf: - params = ["DataFrame", "Series"] param_names = ["constructor"] @@ -233,12 +235,11 @@ def time_asof_nan_single(self, constructor): class SortIndex: - params = [True, False] param_names = ["monotonic"] def setup(self, monotonic): - N = 10 ** 5 + N = 10**5 idx = date_range(start="1/1/2000", periods=N, freq="s") self.s = Series(np.random.randn(N), index=idx) if not monotonic: @@ -263,137 +264,7 @@ def time_lookup_and_cleanup(self): self.ts.index._cleanup() -class ToDatetimeYYYYMMDD: - def setup(self): - rng = date_range(start="1/1/2000", periods=10000, freq="D") - self.stringsD = Series(rng.strftime("%Y%m%d")) - - def time_format_YYYYMMDD(self): - to_datetime(self.stringsD, format="%Y%m%d") - - -class ToDatetimeCacheSmallCount: - - params = ([True, False], [50, 500, 5000, 100000]) - param_names = ["cache", "count"] - - def setup(self, cache, count): - rng = date_range(start="1/1/1971", periods=count) - self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() - - def time_unique_date_strings(self, cache, count): - to_datetime(self.unique_date_strings, cache=cache) - - -class ToDatetimeISO8601: - def setup(self): - rng = date_range(start="1/1/2000", periods=20000, freq="H") - self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() - self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() - self.strings_tz_space = [ - x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng - ] - - def time_iso8601(self): - to_datetime(self.strings) - - def time_iso8601_nosep(self): - to_datetime(self.strings_nosep) - - def time_iso8601_format(self): - to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S") - - def time_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") - - def time_iso8601_tz_spaceformat(self): - to_datetime(self.strings_tz_space) - - -class ToDatetimeNONISO8601: - def setup(self): - N = 10000 - half = int(N / 2) - ts_string_1 = "March 1, 2018 12:00:00+0400" - ts_string_2 = "March 1, 2018 12:00:00+0500" - self.same_offset = [ts_string_1] * N - self.diff_offset = [ts_string_1] * half + [ts_string_2] * half - - def time_same_offset(self): - to_datetime(self.same_offset) - - def time_different_offset(self): - to_datetime(self.diff_offset) - - -class ToDatetimeFormatQuarters: - def setup(self): - self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) - - def time_infer_quarter(self): - to_datetime(self.s) - - -class 
ToDatetimeFormat: - def setup(self): - N = 100000 - self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) - self.s2 = self.s.str.replace(":\\S+$", "") - - self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N - self.diff_offset = [ - f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) - ] * int(N / 10) - - def time_exact(self): - to_datetime(self.s2, format="%d%b%y") - - def time_no_exact(self): - to_datetime(self.s, format="%d%b%y", exact=False) - - def time_same_offset(self): - to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - - def time_different_offset(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - - def time_same_offset_to_utc(self): - to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) - - def time_different_offset_to_utc(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) - - -class ToDatetimeCache: - - params = [True, False] - param_names = ["cache"] - - def setup(self, cache): - N = 10000 - self.unique_numeric_seconds = list(range(N)) - self.dup_numeric_seconds = [1000] * N - self.dup_string_dates = ["2000-02-11"] * N - self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N - - def time_unique_seconds_and_unit(self, cache): - to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) - - def time_dup_seconds_and_unit(self, cache): - to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) - - def time_dup_string_dates(self, cache): - to_datetime(self.dup_string_dates, cache=cache) - - def time_dup_string_dates_and_format(self, cache): - to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) - - def time_dup_string_tzoffset_dates(self, cache): - to_datetime(self.dup_string_with_tz, cache=cache) - - class DatetimeAccessor: - params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] param_names = "tz" diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 0607a799ec707..3a2baec54109a 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -12,7 +12,7 @@ class TimeGetTimedeltaField: params = [ _sizes, - ["days", "h", "s", "seconds", "ms", "microseconds", "us", "ns", "nanoseconds"], + ["seconds", "microseconds", "nanoseconds"], ] param_names = ["size", "field"] diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 7d4e0556f4d96..b263ae21422b6 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -1,5 +1,8 @@ try: - from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized + from pandas._libs.tslibs import ( + is_date_array_normalized, + normalize_i8_timestamps, + ) except ImportError: from pandas._libs.tslibs.conversion import ( normalize_i8_timestamps, @@ -8,7 +11,11 @@ import pandas as pd -from .tslib import _sizes, _tzs +from .tslib import ( + _sizes, + _tzs, + tzlocal_obj, +) class Normalize: @@ -24,9 +31,15 @@ def setup(self, size, tz): dti = pd.date_range("2016-01-01", periods=10, tz=tz).repeat(size // 10) self.i8data = dti.asi8 + if size == 10**6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + def time_normalize_i8_timestamps(self, size, tz): - normalize_i8_timestamps(self.i8data, tz) + # 10 i.e. 
NPY_FR_ns + normalize_i8_timestamps(self.i8data, tz, 10) def time_is_date_array_normalized(self, size, tz): # TODO: cases with different levels of short-circuiting - is_date_array_normalized(self.i8data, tz) + # 10 i.e. NPY_FR_ns + is_date_array_normalized(self.i8data, tz, 10) diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index fc1efe63307b2..1f48ec504acf1 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -9,12 +9,12 @@ from pandas import offsets try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass hcal = pandas.tseries.holiday.USFederalHolidayCalendar() -# These offsets currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ offsets.Day(), offsets.BYearEnd(), @@ -45,7 +45,6 @@ class OnOffset: - params = offset_objs param_names = ["offset"] @@ -63,7 +62,6 @@ def time_on_offset(self, offset): class OffestDatetimeArithmetic: - params = offset_objs param_names = ["offset"] @@ -71,11 +69,8 @@ def setup(self, offset): self.date = datetime(2011, 1, 1) self.dt64 = np.datetime64("2011-01-01 09:00Z") - def time_apply(self, offset): - offset.apply(self.date) - - def time_apply_np_dt64(self, offset): - offset.apply(self.dt64) + def time_add_np_dt64(self, offset): + offset + self.dt64 def time_add(self, offset): self.date + offset diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 849e8ec864ac2..2d192889c39f3 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -1,15 +1,22 @@ """ -Period benchmarks that rely only on tslibs. See benchmarks.period for -Period benchmarks that rely on other parts fo pandas. +Period benchmarks that rely only on tslibs. See benchmarks.period for +Period benchmarks that rely on other parts of pandas. 
""" import numpy as np -from pandas._libs.tslibs.period import Period, periodarr_to_dt64arr +from pandas._libs.tslibs.period import ( + Period, + periodarr_to_dt64arr, +) from pandas.tseries.frequencies import to_offset -from .tslib import _sizes, _tzs +from .tslib import ( + _sizes, + _tzs, + tzlocal_obj, +) try: from pandas._libs.tslibs.vectorized import dt64arr_to_periodarr @@ -18,7 +25,6 @@ class PeriodProperties: - params = ( ["M", "min"], [ @@ -49,7 +55,6 @@ def time_property(self, freq, attr): class PeriodUnaryMethods: - params = ["M", "min"] param_names = ["freq"] @@ -123,6 +128,10 @@ class TimeDT64ArrToPeriodArr: param_names = ["size", "freq", "tz"] def setup(self, size, freq, tz): + if size == 10**6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.arange(10, dtype="i8").repeat(size // 10) self.i8values = arr diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 280be7932d4db..44f288c7de216 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -17,34 +17,33 @@ df.loc[key] = (val.average, val.stdev) """ -from datetime import timedelta, timezone - -from dateutil.tz import gettz, tzlocal import numpy as np -import pytz try: from pandas._libs.tslibs import get_resolution except ImportError: from pandas._libs.tslibs.resolution import get_resolution +from .tslib import ( + _sizes, + _tzs, + tzlocal_obj, +) + class TimeResolution: params = ( ["D", "h", "m", "s", "us", "ns"], - [1, 100, 10 ** 4, 10 ** 6], - [ - None, - timezone.utc, - timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), - gettz("Asia/Tokyo"), - tzlocal(), - ], + _sizes, + _tzs, ) param_names = ["unit", "size", "tz"] def setup(self, unit, size, tz): + if size == 10**6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.random.randint(0, 10, size=size, dtype="i8") arr = arr.view(f"M8[{unit}]").astype("M8[ns]").view("i8") self.i8data = arr diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index 6ed273281569b..2daf1861eb80a 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -1,6 +1,6 @@ """ -Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for -Timedelta benchmarks that rely on other parts fo pandas. +Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for +Timedelta benchmarks that rely on other parts of pandas. """ import datetime diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 40f8e561f5238..d7706a39dfae5 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,22 +1,11 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime -from dateutil.tz import gettz, tzlocal, tzutc import numpy as np import pytz from pandas import Timestamp -# One case for each type of tzinfo object that has its own code path -# in tzconversion code. 
-_tzs = [ - None, - pytz.timezone("Europe/Amsterdam"), - gettz("US/Central"), - pytz.UTC, - tzutc(), - timezone(timedelta(minutes=60)), - tzlocal(), -] +from .tslib import _tzs class TimestampConstruction: @@ -61,62 +50,58 @@ def time_from_pd_timestamp(self): class TimestampProperties: - _freqs = [None, "B"] - params = [_tzs, _freqs] - param_names = ["tz", "freq"] + params = [_tzs] + param_names = ["tz"] - def setup(self, tz, freq): - self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq) + def setup(self, tz): + self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz) - def time_tz(self, tz, freq): + def time_tz(self, tz): self.ts.tz - def time_dayofweek(self, tz, freq): + def time_dayofweek(self, tz): self.ts.dayofweek - def time_dayofyear(self, tz, freq): + def time_dayofyear(self, tz): self.ts.dayofyear - def time_week(self, tz, freq): + def time_week(self, tz): self.ts.week - def time_quarter(self, tz, freq): + def time_quarter(self, tz): self.ts.quarter - def time_days_in_month(self, tz, freq): + def time_days_in_month(self, tz): self.ts.days_in_month - def time_freqstr(self, tz, freq): - self.ts.freqstr - - def time_is_month_start(self, tz, freq): + def time_is_month_start(self, tz): self.ts.is_month_start - def time_is_month_end(self, tz, freq): + def time_is_month_end(self, tz): self.ts.is_month_end - def time_is_quarter_start(self, tz, freq): + def time_is_quarter_start(self, tz): self.ts.is_quarter_start - def time_is_quarter_end(self, tz, freq): + def time_is_quarter_end(self, tz): self.ts.is_quarter_end - def time_is_year_start(self, tz, freq): + def time_is_year_start(self, tz): self.ts.is_year_start - def time_is_year_end(self, tz, freq): + def time_is_year_end(self, tz): self.ts.is_year_end - def time_is_leap_year(self, tz, freq): + def time_is_leap_year(self, tz): self.ts.is_leap_year - def time_microsecond(self, tz, freq): + def time_microsecond(self, tz): self.ts.microsecond - def time_month_name(self, tz, freq): + def time_month_name(self, tz): self.ts.month_name() - def time_weekday_name(self, tz, freq): + def time_weekday_name(self, tz): self.ts.day_name() diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 5952a402bf89a..97ec80201dd16 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -15,9 +15,15 @@ val = %timeit -o tr.time_ints_to_pydatetime(box, size, tz) df.loc[key] = (val.average, val.stdev) """ -from datetime import timedelta, timezone +from datetime import ( + timedelta, + timezone, +) -from dateutil.tz import gettz, tzlocal +from dateutil.tz import ( + gettz, + tzlocal, +) import numpy as np import pytz @@ -26,15 +32,16 @@ except ImportError: from pandas._libs.tslib import ints_to_pydatetime +tzlocal_obj = tzlocal() _tzs = [ None, timezone.utc, timezone(timedelta(minutes=60)), pytz.timezone("US/Pacific"), gettz("Asia/Tokyo"), - tzlocal(), + tzlocal_obj, ] -_sizes = [0, 1, 100, 10 ** 4, 10 ** 6] +_sizes = [0, 1, 100, 10**4, 10**6] class TimeIntsToPydatetime: @@ -44,15 +51,18 @@ class TimeIntsToPydatetime: _tzs, ) param_names = ["box", "size", "tz"] - # TODO: fold? freq? + # TODO: fold? 
def setup(self, box, size, tz): + if box == "date" and tz is not None: + # tz is ignored, so avoid running redundant benchmarks + raise NotImplementedError # skip benchmark + if size == 10**6 and tz is _tzs[-1]: + # This is cumbersomely-slow, so skip to trim runtime + raise NotImplementedError # skip benchmark + arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr def time_ints_to_pydatetime(self, box, size, tz): - if box == "date": - # ints_to_pydatetime does not allow non-None tz with date; - # this will mean doing some duplicate benchmarks - tz = None ints_to_pydatetime(self.i8data, tz, box=box) diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index c2c90024ca5bd..c6b510efdca69 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -3,14 +3,22 @@ from pandas._libs.tslibs.tzconversion import tz_localize_to_utc -from .tslib import _sizes, _tzs +from .tslib import ( + _sizes, + _tzs, + tzlocal_obj, +) try: old_sig = False - from pandas._libs.tslibs.tzconversion import tz_convert_from_utc + from pandas._libs.tslibs import tz_convert_from_utc except ImportError: - old_sig = True - from pandas._libs.tslibs.tzconversion import tz_convert as tz_convert_from_utc + try: + old_sig = False + from pandas._libs.tslibs.tzconversion import tz_convert_from_utc + except ImportError: + old_sig = True + from pandas._libs.tslibs.tzconversion import tz_convert as tz_convert_from_utc class TimeTZConvert: @@ -21,6 +29,10 @@ class TimeTZConvert: param_names = ["size", "tz"] def setup(self, size, tz): + if size == 10**6 and tz is tzlocal_obj: + # tzlocal is cumbersomely slow, so skip to keep runtime in check + raise NotImplementedError + arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr @@ -28,9 +40,6 @@ def time_tz_convert_from_utc(self, size, tz): # effectively: # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) - if size >= 10 ** 6 and str(tz) == "tzlocal()": - # asv fill will because each call takes 8+seconds - return if old_sig: tz_convert_from_utc(self.i8data, UTC, tz) else: diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index e45cafc02cb61..0000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml -trigger: -- master - -pr: -- master - -variables: - PYTEST_WORKERS: auto - -jobs: -# Mac and Linux use the same template -- template: ci/azure/posix.yml - parameters: - name: macOS - vmImage: macOS-10.14 - -- template: ci/azure/posix.yml - parameters: - name: Linux - vmImage: ubuntu-16.04 - -- template: ci/azure/windows.yml - parameters: - name: Windows - vmImage: vs2017-win2016 diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml deleted file mode 100644 index f716974f6add1..0000000000000 --- a/ci/azure/posix.yml +++ /dev/null @@ -1,101 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - ${{ if eq(parameters.name, 'macOS') }}: - py36_macos: - ENV_FILE: ci/deps/azure-macos-36.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - - ${{ if eq(parameters.name, 'Linux') }}: - py36_minimum_versions: - ENV_FILE: ci/deps/azure-36-minimum_versions.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network and not clipboard" - - py36_locale_slow_old_np: - ENV_FILE: ci/deps/azure-36-locale_slow.yaml - CONDA_PY: "36" - PATTERN: "slow" - 
# pandas does not use the language (zh_CN), but should support different encodings (utf8) - # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any - LANG: "zh_CN.utf8" - LC_ALL: "zh_CN.utf8" - EXTRA_APT: "language-pack-zh-hans" - - py36_slow: - ENV_FILE: ci/deps/azure-36-slow.yaml - CONDA_PY: "36" - PATTERN: "slow" - - py36_locale: - ENV_FILE: ci/deps/azure-36-locale.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - LANG: "it_IT.utf8" - LC_ALL: "it_IT.utf8" - EXTRA_APT: "language-pack-it xsel" - - #py36_32bit: - # ENV_FILE: ci/deps/azure-36-32bit.yaml - # CONDA_PY: "36" - # PATTERN: "not slow and not network and not clipboard" - # BITS32: "yes" - - py37_locale: - ENV_FILE: ci/deps/azure-37-locale.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - LANG: "zh_CN.utf8" - LC_ALL: "zh_CN.utf8" - EXTRA_APT: "language-pack-zh-hans xsel" - - py37_np_dev: - ENV_FILE: ci/deps/azure-37-numpydev.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - TEST_ARGS: "-W error" - PANDAS_TESTING_MODE: "deprecate" - EXTRA_APT: "xsel" - - steps: - - script: | - if [ "$(uname)" == "Linux" ]; then - sudo apt-get update - sudo apt-get install -y libc6-dev-i386 $EXTRA_APT - fi - displayName: 'Install extra packages' - - - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - displayName: 'Set conda path' - - - script: ci/setup_env.sh - displayName: 'Setup environment and build pandas' - - - script: | - source activate pandas-dev - ci/run_tests.sh - displayName: 'Test' - - - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - displayName: 'Build versions' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' - - - script: | - source activate pandas-dev - python ci/print_skipped.py - displayName: 'Print skipped tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml deleted file mode 100644 index 87f1bfd2adb79..0000000000000 --- a/ci/azure/windows.yml +++ /dev/null @@ -1,57 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - py36_np15: - ENV_FILE: ci/deps/azure-windows-36.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - - py37_np18: - ENV_FILE: ci/deps/azure-windows-37.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - - steps: - - powershell: | - Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" - Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" - displayName: 'Add conda to PATH' - - - script: conda update -q -n base conda - displayName: 'Update conda' - - - bash: | - conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml - displayName: 'Create anaconda environment' - - - bash: | - source activate pandas-dev - conda list - python setup.py build_ext -q -i -j 4 - python -m pip install --no-build-isolation -e . 
- displayName: 'Build' - - - bash: | - source activate pandas-dev - ci/run_tests.sh - displayName: 'Test' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' - - - bash: | - source activate pandas-dev - python ci/print_skipped.py - displayName: 'Print skipped tests' diff --git a/ci/build39.sh b/ci/build39.sh deleted file mode 100755 index f85e1c7def206..0000000000000 --- a/ci/build39.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -e -# Special build for python3.9 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz pytest pytest-xdist hypothesis -pip install cython --pre # https://github.com/cython/cython/issues/3395 - -git clone https://github.com/numpy/numpy -cd numpy -python setup.py build_ext --inplace -python setup.py install -cd .. -rm -rf numpy - -python setup.py build_ext -inplace -python -m pip install --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" diff --git a/ci/check_cache.sh b/ci/check_cache.sh deleted file mode 100755 index b83144fc45ef4..0000000000000 --- a/ci/check_cache.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# currently not used -# script to make sure that cache is clean -# Travis CI now handles this - -if [ "$TRAVIS_PULL_REQUEST" == "false" ] -then - echo "Not a PR: checking for changes in ci/ from last 2 commits" - git diff HEAD~2 --numstat | grep -E "ci/" - ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) -else - echo "PR: checking for changes in ci/ from last 2 commits" - git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD - git diff PR_HEAD~2 --numstat | grep -E "ci/" - ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) -fi - -CACHE_DIR="$HOME/.cache/" -CCACHE_DIR="$HOME/.ccache/" - -if [ $ci_changes -ne 0 ] -then - echo "Files have changed in ci/ deleting all caches" - rm -rf "$CACHE_DIR" - rm -rf "$CCACHE_DIR" -fi diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh deleted file mode 100755 index 9dbcd4f98683e..0000000000000 --- a/ci/check_git_tags.sh +++ /dev/null @@ -1,28 +0,0 @@ -set -e - -if [[ ! $(git tag) ]]; then - echo "No git tags in clone, please sync your git tags with upstream using:" - echo " git fetch --tags upstream" - echo " git push --tags origin" - echo "" - echo "If the issue persists, the clone depth needs to be increased in .travis.yml" - exit 1 -fi - -# This will error if there are no tags and we omit --always -DESCRIPTION=$(git describe --long --tags) -echo "$DESCRIPTION" - -if [[ "$DESCRIPTION" == *"untagged"* ]]; then - echo "Unable to determine most recent tag, aborting build" - exit 1 -else - if [[ "$DESCRIPTION" != *"g"* ]]; then - # A good description will have the hash prefixed by g, a bad one will be - # just the hash - echo "Unable to determine most recent tag, aborting build" - exit 1 - else - echo "$(git tag)" - fi -fi diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..0333975ed269f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -3,23 +3,19 @@ # Run checks related to code quality. # # This script is intended for both the CI and to check locally that code standards are -# respected. 
We are currently linting (PEP-8 and similar), looking for patterns of -# common mistakes (sphinx directives with missing blank lines, old style classes, -# unwanted imports...), we run doctests here (currently some files only), and we +# respected. We run doctests here (currently some files only), and we # validate formatting error in docstrings. # # Usage: # $ ./ci/code_checks.sh # run all checks -# $ ./ci/code_checks.sh lint # run linting only -# $ ./ci/code_checks.sh patterns # check for patterns that should not exist # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors -# $ ./ci/code_checks.sh dependencies # check that dependencies are consistent -# $ ./ci/code_checks.sh typing # run static type analysis +# $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free +# $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "dependencies" || "$1" == "typing" ]] || \ - { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|dependencies|typing]"; exit 9999; } +[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -39,210 +35,7 @@ function invgrep { } if [[ "$GITHUB_ACTIONS" == "true" ]]; then - FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s" INVGREP_PREPEND="##[error]" -else - FLAKE8_FORMAT="default" -fi - -### LINTING ### -if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - - echo "black --version" - black --version - - MSG='Checking black formatting' ; echo $MSG - black . --check - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # `setup.cfg` contains the list of error codes that are being ignored in flake8 - - echo "flake8 --version" - flake8 --version - - # pandas/_libs/src is C code, so no need to search there. - MSG='Linting .py code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" . - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pyx and .pxd code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas --append-config=flake8/cython.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pxi.in' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas/_libs --append-config=flake8/cython-template.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - - echo "flake8-rst --version" - flake8-rst --version - - MSG='Linting code-blocks in .rst documentation' ; echo $MSG - flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Check that cython casting is of the form `obj` as opposed to ` obj`; - # it doesn't make a difference, but we want to be internally consistent. 
- # Note: this grep pattern is (intended to be) equivalent to the python - # regex r'(?])> ' - MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG - invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # readability/casting: Warnings about C casting instead of C++ casting - # runtime/int: Warnings about using C number types instead of C++ ones - # build/include_subdir: Warnings about prefacing included header files with directory - - # We don't lint all C files because we don't want to lint any that are built - # from Cython files nor do we want to lint C files that we didn't modify for - # this particular codebase (e.g. src/headers, src/klib). However, - # we can lint all header files since they aren't "generated" like C files are. - MSG='Linting .c and .h' ; echo $MSG - cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of not concatenated strings' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" --format="##[error]{source_path}:{line_number}:{msg}" . - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" . - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for strings with wrong placed spaces' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" --format="##[error]{source_path}:{line_number}:{msg}" . - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" . 
- fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - echo "isort --version-number" - isort --version-number - - # Imports - Check formatting using isort see setup.cfg for settings - MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) - else - eval $ISORT_CMD - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - -fi - -### PATTERNS ### -if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then - - # Check for imports from pandas.core.common instead of `import pandas.core.common as com` - # Check for imports from collections.abc instead of `from collections import abc` - MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas.core import common" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from collections.abc import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from numpy import nan" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Checks for test suite - # Check for imports from pandas._testing instead of `import pandas._testing as tm` - invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas import _testing as tm" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # No direct imports from conftest - invgrep -R --include="*.py*" -E "conftest import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "import conftest" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of exec' ; echo $MSG - invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for pytest warns' ; echo $MSG - invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for pytest raises without context' ; echo $MSG - invgrep -r -E --include '*.py' "[[:space:]] pytest.raises" pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for python2-style file encodings' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "# -\*- coding: utf-8 -\*-" pandas scripts - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for python2-style super usage' ; echo $MSG - invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Check for the following code in testing: `np.testing` and `np.array_equal` - MSG='Check for invalid testing' ; echo $MSG - invgrep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Check for the following code in the extension array base tests: `tm.assert_frame_equal` and `tm.assert_series_equal` - MSG='Check for invalid EA testing' ; echo $MSG - invgrep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for deprecated messages without sphinx directive' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for python2 new-style classes and 
for empty parentheses' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG - invgrep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for incorrect sphinx directives' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. (autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Check for the following code in testing: `unittest.mock`, `mock.Mock()` or `mock.patch` - MSG='Check that unittest.mock is not used (pytest builtin monkeypatch fixture should be used instead)' ; echo $MSG - invgrep -r -E --include '*.py' '(unittest(\.| import )mock|mock\.Mock\(\)|mock\.patch)' pandas/tests/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for wrong space after code-block directive and before colon (".. code-block ::" instead of ".. code-block::")' ; echo $MSG - invgrep -R --include="*.rst" ".. code-block ::" doc/source - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for wrong space after ipython directive and before colon (".. ipython ::" instead of ".. ipython::")' ; echo $MSG - invgrep -R --include="*.rst" ".. ipython ::" doc/source - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for extra blank lines after the class definition' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of {foo!r} instead of {repr(foo)}' ; echo $MSG - invgrep -R --include=*.{py,pyx} '!r}' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of comment-based annotation syntax' ; echo $MSG - invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG - invgrep -R --include=*.{py,pyx} '\.__class__' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of xrange instead of range' ; echo $MSG - invgrep -R --include=*.{py,pyx} 'xrange' pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG - INVGREP_APPEND=" <- trailing whitespaces found" - invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * - RET=$(($RET + $?)) ; echo $MSG "DONE" - unset INVGREP_APPEND fi ### CODE ### @@ -254,8 +47,8 @@ import sys import pandas blocklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', - 'lxml', 'matplotlib', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', - 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} + 'lxml', 'matplotlib', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', + 'tables', 'urllib.request', 'xlrd', 'xlsxwriter'} # GH#28227 for some of these check for top-level modules, while others are # more specific (e.g. 
urllib.request) @@ -272,80 +65,13 @@ fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - # Individual files - - MSG='Doctests accessor.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/accessor.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests aggregation.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/aggregation.py + MSG='Doctests' ; echo $MSG + # Ignore test_*.py files or else the unit tests will run + python -m pytest --doctest-modules --ignore-glob="**/test_*.py" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests base.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/base.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests construction.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/construction.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests frame.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/frame.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests generic.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/generic.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests series.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/series.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests strings.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/strings.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Directories - - MSG='Doctests arrays'; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests computation' ; echo $MSG - pytest -q --doctest-modules pandas/core/computation/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests dtypes'; echo $MSG - pytest -q --doctest-modules pandas/core/dtypes/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests groupby' ; echo $MSG - pytest -q --doctest-modules pandas/core/groupby/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests indexes' ; echo $MSG - pytest -q --doctest-modules pandas/core/indexes/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests ops' ; echo $MSG - pytest -q --doctest-modules pandas/core/ops/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests reshape' ; echo $MSG - pytest -q --doctest-modules pandas/core/reshape/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests tools' ; echo $MSG - pytest -q --doctest-modules pandas/core/tools/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests window' ; echo $MSG - pytest -q --doctest-modules pandas/core/window/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests tseries' ; echo $MSG - pytest -q --doctest-modules pandas/tseries/ + MSG='Cython Doctests' ; echo $MSG + python -m pytest --doctest-cython pandas/_libs RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -353,35 +79,539 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03 - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Validate correct capitalization among titles in documentation' ; echo $MSG - $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development $BASE_DIR/doc/source/reference + MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, 
PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Partially validate docstrings (EX01)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \ + pandas.Series.index \ + pandas.Series.nbytes \ + pandas.Series.ndim \ + pandas.Series.size \ + pandas.Series.T \ + pandas.Series.hasnans \ + pandas.Series.to_list \ + pandas.Series.__iter__ \ + pandas.Series.keys \ + pandas.Series.item \ + pandas.Series.pipe \ + pandas.Series.kurt \ + pandas.Series.mean \ + pandas.Series.median \ + pandas.Series.mode \ + pandas.Series.sem \ + pandas.Series.skew \ + pandas.Series.kurtosis \ + pandas.Series.is_unique \ + pandas.Series.is_monotonic_increasing \ + pandas.Series.is_monotonic_decreasing \ + pandas.Series.backfill \ + pandas.Series.pad \ + pandas.Series.argsort \ + pandas.Series.reorder_levels \ + pandas.Series.ravel \ + pandas.Series.first_valid_index \ + pandas.Series.last_valid_index \ + pandas.Series.dt.date \ + pandas.Series.dt.time \ + pandas.Series.dt.timetz \ + pandas.Series.dt.dayofyear \ + pandas.Series.dt.day_of_year \ + pandas.Series.dt.quarter \ + pandas.Series.dt.daysinmonth \ + pandas.Series.dt.days_in_month \ + pandas.Series.dt.tz \ + pandas.Series.dt.end_time \ + pandas.Series.dt.days \ + pandas.Series.dt.seconds \ + pandas.Series.dt.microseconds \ + pandas.Series.dt.nanoseconds \ + pandas.Series.str.center \ + pandas.Series.str.decode \ + pandas.Series.str.encode \ + pandas.Series.str.find \ + pandas.Series.str.fullmatch \ + pandas.Series.str.index \ + pandas.Series.str.ljust \ + pandas.Series.str.match \ + pandas.Series.str.normalize \ + pandas.Series.str.rfind \ + pandas.Series.str.rindex \ + pandas.Series.str.rjust \ + pandas.Series.str.translate \ + pandas.Series.sparse \ + pandas.DataFrame.sparse \ + pandas.Series.cat.categories \ + pandas.Series.cat.ordered \ + pandas.Series.cat.codes \ + pandas.Series.cat.reorder_categories \ + pandas.Series.cat.set_categories \ + pandas.Series.cat.as_ordered \ + pandas.Series.cat.as_unordered \ + pandas.Series.sparse.fill_value \ + pandas.Flags \ + pandas.Series.attrs \ + pandas.Series.plot \ + pandas.Series.hist \ + pandas.Series.to_string \ + pandas.errors.AbstractMethodError \ + pandas.errors.AccessorRegistrationWarning \ + pandas.errors.AttributeConflictWarning \ + pandas.errors.DataError \ + pandas.errors.EmptyDataError \ + pandas.errors.IncompatibilityWarning \ + pandas.errors.InvalidComparison \ + pandas.errors.InvalidIndexError \ + pandas.errors.InvalidVersion \ + pandas.errors.IntCastingNaNError \ + pandas.errors.LossySetitemError \ + pandas.errors.MergeError \ + pandas.errors.NoBufferPresent \ + pandas.errors.NullFrequencyError \ + pandas.errors.NumbaUtilError \ + pandas.errors.OptionError \ + pandas.errors.OutOfBoundsDatetime \ + pandas.errors.OutOfBoundsTimedelta \ + pandas.errors.ParserError \ + pandas.errors.PerformanceWarning \ + pandas.errors.PyperclipException \ + pandas.errors.PyperclipWindowsException \ + pandas.errors.UnsortedIndexError \ + pandas.errors.UnsupportedFunctionCall \ + pandas.show_versions \ + pandas.test \ + pandas.NaT \ + pandas.Timestamp.as_unit \ + pandas.Timestamp.ctime \ + pandas.Timestamp.date \ + pandas.Timestamp.dst \ + 
pandas.Timestamp.isocalendar \ + pandas.Timestamp.isoweekday \ + pandas.Timestamp.strptime \ + pandas.Timestamp.time \ + pandas.Timestamp.timetuple \ + pandas.Timestamp.timetz \ + pandas.Timestamp.to_datetime64 \ + pandas.Timestamp.toordinal \ + pandas.Timestamp.tzname \ + pandas.Timestamp.utcoffset \ + pandas.Timestamp.utctimetuple \ + pandas.Timestamp.weekday \ + pandas.arrays.DatetimeArray \ + pandas.Timedelta.view \ + pandas.Timedelta.as_unit \ + pandas.Timedelta.ceil \ + pandas.Timedelta.floor \ + pandas.Timedelta.round \ + pandas.Timedelta.to_pytimedelta \ + pandas.Timedelta.to_timedelta64 \ + pandas.Timedelta.to_numpy \ + pandas.Timedelta.total_seconds \ + pandas.arrays.TimedeltaArray \ + pandas.Period.end_time \ + pandas.Period.freqstr \ + pandas.Period.is_leap_year \ + pandas.Period.month \ + pandas.Period.quarter \ + pandas.Period.year \ + pandas.Period.asfreq \ + pandas.Period.now \ + pandas.arrays.PeriodArray \ + pandas.Interval.closed \ + pandas.Interval.left \ + pandas.Interval.length \ + pandas.Interval.right \ + pandas.arrays.IntervalArray.left \ + pandas.arrays.IntervalArray.right \ + pandas.arrays.IntervalArray.closed \ + pandas.arrays.IntervalArray.mid \ + pandas.arrays.IntervalArray.length \ + pandas.arrays.IntervalArray.is_non_overlapping_monotonic \ + pandas.arrays.IntervalArray.from_arrays \ + pandas.arrays.IntervalArray.to_tuples \ + pandas.Int8Dtype \ + pandas.Int16Dtype \ + pandas.Int32Dtype \ + pandas.Int64Dtype \ + pandas.UInt8Dtype \ + pandas.UInt16Dtype \ + pandas.UInt32Dtype \ + pandas.UInt64Dtype \ + pandas.NA \ + pandas.Float32Dtype \ + pandas.Float64Dtype \ + pandas.CategoricalDtype.categories \ + pandas.CategoricalDtype.ordered \ + pandas.Categorical.dtype \ + pandas.Categorical.categories \ + pandas.Categorical.ordered \ + pandas.Categorical.codes \ + pandas.Categorical.__array__ \ + pandas.SparseDtype \ + pandas.DatetimeTZDtype.unit \ + pandas.DatetimeTZDtype.tz \ + pandas.PeriodDtype.freq \ + pandas.IntervalDtype.subtype \ + pandas_dtype \ + pandas.api.types.is_bool \ + pandas.api.types.is_complex \ + pandas.api.types.is_float \ + pandas.api.types.is_integer \ + pandas.api.types.pandas_dtype \ + pandas.read_clipboard \ + pandas.ExcelFile \ + pandas.ExcelFile.parse \ + pandas.DataFrame.to_html \ + pandas.io.formats.style.Styler.to_html \ + pandas.HDFStore.put \ + pandas.HDFStore.append \ + pandas.HDFStore.get \ + pandas.HDFStore.select \ + pandas.HDFStore.info \ + pandas.HDFStore.keys \ + pandas.HDFStore.groups \ + pandas.HDFStore.walk \ + pandas.read_feather \ + pandas.DataFrame.to_feather \ + pandas.read_parquet \ + pandas.read_orc \ + pandas.read_sas \ + pandas.read_spss \ + pandas.read_sql_query \ + pandas.read_gbq \ + pandas.io.stata.StataReader.data_label \ + pandas.io.stata.StataReader.value_labels \ + pandas.io.stata.StataReader.variable_labels \ + pandas.io.stata.StataWriter.write_file \ + pandas.core.resample.Resampler.__iter__ \ + pandas.core.resample.Resampler.groups \ + pandas.core.resample.Resampler.indices \ + pandas.core.resample.Resampler.get_group \ + pandas.core.resample.Resampler.ffill \ + pandas.core.resample.Resampler.asfreq \ + pandas.core.resample.Resampler.count \ + pandas.core.resample.Resampler.nunique \ + pandas.core.resample.Resampler.max \ + pandas.core.resample.Resampler.mean \ + pandas.core.resample.Resampler.median \ + pandas.core.resample.Resampler.min \ + pandas.core.resample.Resampler.ohlc \ + pandas.core.resample.Resampler.prod \ + pandas.core.resample.Resampler.size \ + pandas.core.resample.Resampler.sem \ + 
pandas.core.resample.Resampler.std \ + pandas.core.resample.Resampler.sum \ + pandas.core.resample.Resampler.var \ + pandas.core.resample.Resampler.quantile \ + pandas.describe_option \ + pandas.reset_option \ + pandas.get_option \ + pandas.set_option \ + pandas.plotting.deregister_matplotlib_converters \ + pandas.plotting.plot_params \ + pandas.plotting.register_matplotlib_converters \ + pandas.plotting.table \ + pandas.util.hash_array \ + pandas.util.hash_pandas_object \ + pandas_object \ + pandas.api.interchange.from_dataframe \ + pandas.Index.values \ + pandas.Index.hasnans \ + pandas.Index.dtype \ + pandas.Index.inferred_type \ + pandas.Index.shape \ + pandas.Index.name \ + pandas.Index.nbytes \ + pandas.Index.ndim \ + pandas.Index.size \ + pandas.Index.T \ + pandas.Index.memory_usage \ + pandas.Index.copy \ + pandas.Index.drop \ + pandas.Index.identical \ + pandas.Index.insert \ + pandas.Index.is_ \ + pandas.Index.take \ + pandas.Index.putmask \ + pandas.Index.unique \ + pandas.Index.fillna \ + pandas.Index.dropna \ + pandas.Index.astype \ + pandas.Index.item \ + pandas.Index.map \ + pandas.Index.ravel \ + pandas.Index.to_list \ + pandas.Index.append \ + pandas.Index.join \ + pandas.Index.asof_locs \ + pandas.Index.get_slice_bound \ + pandas.RangeIndex \ + pandas.RangeIndex.start \ + pandas.RangeIndex.stop \ + pandas.RangeIndex.step \ + pandas.RangeIndex.from_range \ + pandas.CategoricalIndex.codes \ + pandas.CategoricalIndex.categories \ + pandas.CategoricalIndex.ordered \ + pandas.CategoricalIndex.reorder_categories \ + pandas.CategoricalIndex.set_categories \ + pandas.CategoricalIndex.as_ordered \ + pandas.CategoricalIndex.as_unordered \ + pandas.CategoricalIndex.equals \ + pandas.IntervalIndex.closed \ + pandas.IntervalIndex.values \ + pandas.IntervalIndex.is_non_overlapping_monotonic \ + pandas.IntervalIndex.to_tuples \ + pandas.MultiIndex.dtypes \ + pandas.MultiIndex.drop \ + pandas.DatetimeIndex \ + pandas.DatetimeIndex.date \ + pandas.DatetimeIndex.time \ + pandas.DatetimeIndex.timetz \ + pandas.DatetimeIndex.dayofyear \ + pandas.DatetimeIndex.day_of_year \ + pandas.DatetimeIndex.quarter \ + pandas.DatetimeIndex.tz \ + pandas.DatetimeIndex.freqstr \ + pandas.DatetimeIndex.inferred_freq \ + pandas.DatetimeIndex.indexer_at_time \ + pandas.DatetimeIndex.indexer_between_time \ + pandas.DatetimeIndex.snap \ + pandas.DatetimeIndex.as_unit \ + pandas.DatetimeIndex.to_pydatetime \ + pandas.DatetimeIndex.to_series \ + pandas.DatetimeIndex.mean \ + pandas.DatetimeIndex.std \ + pandas.TimedeltaIndex \ + pandas.TimedeltaIndex.days \ + pandas.TimedeltaIndex.seconds \ + pandas.TimedeltaIndex.microseconds \ + pandas.TimedeltaIndex.nanoseconds \ + pandas.TimedeltaIndex.components \ + pandas.TimedeltaIndex.inferred_freq \ + pandas.TimedeltaIndex.as_unit \ + pandas.TimedeltaIndex.to_pytimedelta \ + pandas.TimedeltaIndex.mean \ + pandas.PeriodIndex.day \ + pandas.PeriodIndex.dayofweek \ + pandas.PeriodIndex.day_of_week \ + pandas.PeriodIndex.dayofyear \ + pandas.PeriodIndex.day_of_year \ + pandas.PeriodIndex.days_in_month \ + pandas.PeriodIndex.daysinmonth \ + pandas.PeriodIndex.end_time \ + pandas.PeriodIndex.freqstr \ + pandas.PeriodIndex.hour \ + pandas.PeriodIndex.is_leap_year \ + pandas.PeriodIndex.minute \ + pandas.PeriodIndex.month \ + pandas.PeriodIndex.quarter \ + pandas.PeriodIndex.second \ + pandas.PeriodIndex.week \ + pandas.PeriodIndex.weekday \ + pandas.PeriodIndex.weekofyear \ + pandas.PeriodIndex.year \ + pandas.PeriodIndex.to_timestamp \ + pandas.core.window.rolling.Rolling.max 
\ + pandas.core.window.rolling.Rolling.cov \ + pandas.core.window.rolling.Rolling.skew \ + pandas.core.window.rolling.Rolling.apply \ + pandas.core.window.rolling.Window.mean \ + pandas.core.window.rolling.Window.sum \ + pandas.core.window.rolling.Window.var \ + pandas.core.window.rolling.Window.std \ + pandas.core.window.expanding.Expanding.count \ + pandas.core.window.expanding.Expanding.sum \ + pandas.core.window.expanding.Expanding.mean \ + pandas.core.window.expanding.Expanding.median \ + pandas.core.window.expanding.Expanding.min \ + pandas.core.window.expanding.Expanding.max \ + pandas.core.window.expanding.Expanding.corr \ + pandas.core.window.expanding.Expanding.cov \ + pandas.core.window.expanding.Expanding.skew \ + pandas.core.window.expanding.Expanding.apply \ + pandas.core.window.expanding.Expanding.quantile \ + pandas.core.window.ewm.ExponentialMovingWindow.mean \ + pandas.core.window.ewm.ExponentialMovingWindow.sum \ + pandas.core.window.ewm.ExponentialMovingWindow.std \ + pandas.core.window.ewm.ExponentialMovingWindow.var \ + pandas.core.window.ewm.ExponentialMovingWindow.corr \ + pandas.core.window.ewm.ExponentialMovingWindow.cov \ + pandas.api.indexers.BaseIndexer \ + pandas.api.indexers.VariableOffsetWindowIndexer \ + pandas.core.groupby.DataFrameGroupBy.__iter__ \ + pandas.core.groupby.SeriesGroupBy.__iter__ \ + pandas.core.groupby.DataFrameGroupBy.groups \ + pandas.core.groupby.SeriesGroupBy.groups \ + pandas.core.groupby.DataFrameGroupBy.indices \ + pandas.core.groupby.SeriesGroupBy.indices \ + pandas.core.groupby.DataFrameGroupBy.get_group \ + pandas.core.groupby.SeriesGroupBy.get_group \ + pandas.core.groupby.DataFrameGroupBy.all \ + pandas.core.groupby.DataFrameGroupBy.any \ + pandas.core.groupby.DataFrameGroupBy.bfill \ + pandas.core.groupby.DataFrameGroupBy.count \ + pandas.core.groupby.DataFrameGroupBy.cummax \ + pandas.core.groupby.DataFrameGroupBy.cummin \ + pandas.core.groupby.DataFrameGroupBy.cumprod \ + pandas.core.groupby.DataFrameGroupBy.cumsum \ + pandas.core.groupby.DataFrameGroupBy.diff \ + pandas.core.groupby.DataFrameGroupBy.ffill \ + pandas.core.groupby.DataFrameGroupBy.max \ + pandas.core.groupby.DataFrameGroupBy.median \ + pandas.core.groupby.DataFrameGroupBy.min \ + pandas.core.groupby.DataFrameGroupBy.ohlc \ + pandas.core.groupby.DataFrameGroupBy.pct_change \ + pandas.core.groupby.DataFrameGroupBy.prod \ + pandas.core.groupby.DataFrameGroupBy.sem \ + pandas.core.groupby.DataFrameGroupBy.shift \ + pandas.core.groupby.DataFrameGroupBy.size \ + pandas.core.groupby.DataFrameGroupBy.skew \ + pandas.core.groupby.DataFrameGroupBy.std \ + pandas.core.groupby.DataFrameGroupBy.sum \ + pandas.core.groupby.DataFrameGroupBy.var \ + pandas.core.groupby.SeriesGroupBy.all \ + pandas.core.groupby.SeriesGroupBy.any \ + pandas.core.groupby.SeriesGroupBy.bfill \ + pandas.core.groupby.SeriesGroupBy.count \ + pandas.core.groupby.SeriesGroupBy.cummax \ + pandas.core.groupby.SeriesGroupBy.cummin \ + pandas.core.groupby.SeriesGroupBy.cumprod \ + pandas.core.groupby.SeriesGroupBy.cumsum \ + pandas.core.groupby.SeriesGroupBy.diff \ + pandas.core.groupby.SeriesGroupBy.ffill \ + pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing \ + pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing \ + pandas.core.groupby.SeriesGroupBy.max \ + pandas.core.groupby.SeriesGroupBy.median \ + pandas.core.groupby.SeriesGroupBy.min \ + pandas.core.groupby.SeriesGroupBy.nunique \ + pandas.core.groupby.SeriesGroupBy.ohlc \ + pandas.core.groupby.SeriesGroupBy.pct_change \ + 
pandas.core.groupby.SeriesGroupBy.prod \ + pandas.core.groupby.SeriesGroupBy.sem \ + pandas.core.groupby.SeriesGroupBy.shift \ + pandas.core.groupby.SeriesGroupBy.size \ + pandas.core.groupby.SeriesGroupBy.skew \ + pandas.core.groupby.SeriesGroupBy.std \ + pandas.core.groupby.SeriesGroupBy.sum \ + pandas.core.groupby.SeriesGroupBy.var \ + pandas.core.groupby.SeriesGroupBy.hist \ + pandas.core.groupby.DataFrameGroupBy.plot \ + pandas.core.groupby.SeriesGroupBy.plot \ + pandas.io.formats.style.Styler \ + pandas.io.formats.style.Styler.from_custom_template \ + pandas.io.formats.style.Styler.set_caption \ + pandas.io.formats.style.Styler.set_sticky \ + pandas.io.formats.style.Styler.set_uuid \ + pandas.io.formats.style.Styler.clear \ + pandas.io.formats.style.Styler.highlight_null \ + pandas.io.formats.style.Styler.highlight_max \ + pandas.io.formats.style.Styler.highlight_min \ + pandas.io.formats.style.Styler.bar \ + pandas.io.formats.style.Styler.to_string \ + pandas.api.extensions.ExtensionDtype \ + pandas.api.extensions.ExtensionArray \ + pandas.arrays.PandasArray \ + pandas.api.extensions.ExtensionArray._accumulate \ + pandas.api.extensions.ExtensionArray._concat_same_type \ + pandas.api.extensions.ExtensionArray._formatter \ + pandas.api.extensions.ExtensionArray._from_factorized \ + pandas.api.extensions.ExtensionArray._from_sequence \ + pandas.api.extensions.ExtensionArray._from_sequence_of_strings \ + pandas.api.extensions.ExtensionArray._reduce \ + pandas.api.extensions.ExtensionArray._values_for_argsort \ + pandas.api.extensions.ExtensionArray._values_for_factorize \ + pandas.api.extensions.ExtensionArray.argsort \ + pandas.api.extensions.ExtensionArray.astype \ + pandas.api.extensions.ExtensionArray.copy \ + pandas.api.extensions.ExtensionArray.view \ + pandas.api.extensions.ExtensionArray.dropna \ + pandas.api.extensions.ExtensionArray.equals \ + pandas.api.extensions.ExtensionArray.factorize \ + pandas.api.extensions.ExtensionArray.fillna \ + pandas.api.extensions.ExtensionArray.insert \ + pandas.api.extensions.ExtensionArray.isin \ + pandas.api.extensions.ExtensionArray.isna \ + pandas.api.extensions.ExtensionArray.ravel \ + pandas.api.extensions.ExtensionArray.searchsorted \ + pandas.api.extensions.ExtensionArray.shift \ + pandas.api.extensions.ExtensionArray.unique \ + pandas.api.extensions.ExtensionArray.dtype \ + pandas.api.extensions.ExtensionArray.nbytes \ + pandas.api.extensions.ExtensionArray.ndim \ + pandas.api.extensions.ExtensionArray.shape \ + pandas.api.extensions.ExtensionArray.tolist \ + pandas.DataFrame.index \ + pandas.DataFrame.columns \ + pandas.DataFrame.__iter__ \ + pandas.DataFrame.keys \ + pandas.DataFrame.iterrows \ + pandas.DataFrame.pipe \ + pandas.DataFrame.kurt \ + pandas.DataFrame.kurtosis \ + pandas.DataFrame.mean \ + pandas.DataFrame.median \ + pandas.DataFrame.sem \ + pandas.DataFrame.skew \ + pandas.DataFrame.backfill \ + pandas.DataFrame.pad \ + pandas.DataFrame.swapaxes \ + pandas.DataFrame.first_valid_index \ + pandas.DataFrame.last_valid_index \ + pandas.DataFrame.attrs \ + pandas.DataFrame.plot \ + pandas.DataFrame.sparse.density \ + pandas.DataFrame.sparse.to_coo \ + pandas.DataFrame.to_gbq \ + pandas.DataFrame.style \ + pandas.DataFrame.__dataframe__ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Partially validate docstrings (EX02)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX02 --ignore_functions \ + pandas.DataFrame.plot.line \ + pandas.Series.plot.line \ + pandas.api.types.infer_dtype \ + 
pandas.api.types.is_datetime64_any_dtype \ + pandas.api.types.is_datetime64_ns_dtype \ + pandas.api.types.is_datetime64tz_dtype \ + pandas.api.types.is_integer_dtype \ + pandas.api.types.is_interval_dtype \ + pandas.api.types.is_period_dtype \ + pandas.api.types.is_signed_integer_dtype \ + pandas.api.types.is_sparse \ + pandas.api.types.is_string_dtype \ + pandas.api.types.is_unsigned_integer_dtype \ + pandas.io.formats.style.Styler.concat \ + pandas.io.formats.style.Styler.export \ + pandas.io.formats.style.Styler.set_td_classes \ + pandas.io.formats.style.Styler.use \ + pandas.plotting.andrews_curves \ + pandas.plotting.autocorrelation_plot \ + pandas.plotting.lag_plot \ + pandas.plotting.parallel_coordinates \ + pandas.plotting.radviz \ + pandas.tseries.frequencies.to_offset RET=$(($RET + $?)) ; echo $MSG "DONE" fi -### DEPENDENCIES ### -if [[ -z "$CHECK" || "$CHECK" == "dependencies" ]]; then +### DOCUMENTATION NOTEBOOKS ### +if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then - MSG='Check that requirements-dev.txt has been generated from environment.yml' ; echo $MSG - $BASE_DIR/scripts/generate_pip_deps_from_conda.py --compare --azure + MSG='Notebooks' ; echo $MSG + jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook RET=$(($RET + $?)) ; echo $MSG "DONE" fi -### TYPING ### -if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then - - echo "mypy --version" - mypy --version - - MSG='Performing static analysis using mypy' ; echo $MSG - mypy pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" +### SINGLE-PAGE DOCS ### +if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then + python doc/make.py --warnings-are-errors --single pandas.Series.value_counts + python doc/make.py --warnings-are-errors --single pandas.Series.str.split + python doc/make.py clean fi - exit $RET diff --git a/ci/condarc.yml b/ci/condarc.yml new file mode 100644 index 0000000000000..9d750b7102c39 --- /dev/null +++ b/ci/condarc.yml @@ -0,0 +1,32 @@ +# https://docs.conda.io/projects/conda/en/latest/configuration.html + +# always_yes (NoneType, bool) +# aliases: yes +# Automatically choose the 'yes' option whenever asked to proceed with a +# conda operation, such as when running `conda install`. +# +always_yes: true + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 30.0 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 10 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. +# +remote_backoff_factor: 3 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. 
+# +remote_read_timeout_secs: 60.0 diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml new file mode 100644 index 0000000000000..ce9d656adc16a --- /dev/null +++ b/ci/deps/actions-310-numpydev.yaml @@ -0,0 +1,28 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.10 + + # build dependencies + - versioneer[toml] + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - hypothesis>=6.34.2 + - pytest-asyncio>=0.17 + + # pandas dependencies + - python-dateutil + - pytz + - pip + + - pip: + - "cython" + - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" + - "--pre" + - "numpy" + - "scipy" + - "tzdata>=2022.1" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml new file mode 100644 index 0000000000000..5b9919d8e4c1f --- /dev/null +++ b/ci/deps/actions-310.yaml @@ -0,0 +1,58 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.10 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + - numba + - numexpr + - openpyxl<3.1.1 + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + - pytables + - pyarrow>=7.0.0 + - pyreadstat + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml new file mode 100644 index 0000000000000..77e4fc9d2c2d9 --- /dev/null +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -0,0 +1,29 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - hypothesis>=6.34.2 + - pytest-asyncio>=0.17.0 + + # required dependencies + - python-dateutil + - numpy + - pytz + - pip + + - pip: + - "tzdata>=2022.1" + - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--prefer-binary" + - "--pre" + - "pyarrow" diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml new file mode 100644 index 0000000000000..ed01238216e9e --- /dev/null +++ b/ci/deps/actions-311.yaml @@ -0,0 +1,58 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + # - numba not compatible with 3.11 + - numexpr + - openpyxl<3.1.1 + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + # - pytables>=3.8.0 # first version that supports 3.11 + - pyarrow>=7.0.0 + - pyreadstat + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard + + - pip: + - tzdata>=2022.1 diff --git 
a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml new file mode 100644 index 0000000000000..3c498663c04df --- /dev/null +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -0,0 +1,73 @@ +# Non-dependencies that pandas utilizes or has compatibility with pandas objects +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - brotlipy + - bottleneck + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + - numba + - numexpr + - openpyxl<3.1.1 + - odfpy + - psycopg2 + - pyarrow>=7.0.0 + - pymysql + - pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard + + # downstream packages + - botocore + - cftime + - dask + - ipython + - geopandas-base + - seaborn + - scikit-learn + - statsmodels + - coverage + - pandas-datareader + - pandas-gbq + - pyyaml + - py + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml new file mode 100644 index 0000000000000..194b59b5abcf2 --- /dev/null +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -0,0 +1,62 @@ +# Minimum version of required + optional dependencies +# Aligned with getting_started/install.rst and compat/_optional.py +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8.0 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil=2.8.2 + - numpy=1.20.3 + - pytz=2020.1 + + # optional dependencies + - beautifulsoup4=4.9.3 + - blosc=1.21.0 + - bottleneck=1.3.2 + - brotlipy=0.7.0 + - fastparquet=0.6.3 + - fsspec=2021.07.0 + - html5lib=1.1 + - hypothesis=6.34.2 + - gcsfs=2021.07.0 + - jinja2=3.0.0 + - lxml=4.6.3 + - matplotlib=3.6.1 + - numba=0.53.1 + - numexpr=2.7.3 + - odfpy=1.4.1 + - qtpy=2.2.0 + - openpyxl=3.0.7 + - pandas-gbq=0.15.0 + - psycopg2=2.8.6 + - pyarrow=7.0.0 + - pymysql=1.0.2 + - pyreadstat=1.1.2 + - pytables=3.6.1 + - python-snappy=0.6.0 + - pyxlsb=1.0.8 + - s3fs=2021.08.0 + - scipy=1.7.1 + - sqlalchemy=1.4.16 + - tabulate=0.8.9 + - xarray=0.21.0 + - xlrd=2.0.1 + - xlsxwriter=1.4.3 + - zstandard=0.15.2 + + - pip: + - pyqt5==5.15.1 + - tzdata==2022.1 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml new file mode 100644 index 0000000000000..2a968f059952e --- /dev/null +++ b/ci/deps/actions-38.yaml @@ -0,0 +1,58 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + - numba + - numexpr + - openpyxl<3.1.1 + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow>=7.0.0 + - pymysql + - 
pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml new file mode 100644 index 0000000000000..a1fba778bfc70 --- /dev/null +++ b/ci/deps/actions-39.yaml @@ -0,0 +1,58 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.9 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + - numba + - numexpr + - openpyxl<3.1.1 + - odfpy + - pandas-gbq + - psycopg2 + - pymysql + - pyarrow>=7.0.0 + - pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml new file mode 100644 index 0000000000000..78ab32d6e7830 --- /dev/null +++ b/ci/deps/actions-pypy-38.yaml @@ -0,0 +1,27 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + # TODO: Add the rest of the dependencies in here + # once the other plentiful failures/segfaults + # with base pandas has been dealt with + - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-asyncio + - pytest-xdist>=2.2.0 + - hypothesis>=6.34.2 + + # required + - numpy + - python-dateutil + - pytz + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml deleted file mode 100644 index 2dc53f8181ac4..0000000000000 --- a/ci/deps/azure-36-32bit.yaml +++ /dev/null @@ -1,26 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - attrs=19.1.0 - - gcc_linux-32 - - gxx_linux-32 - - numpy=1.14.* - - python-dateutil - - pytz=2017.2 - - # see comment above - - pip - - pip: - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml deleted file mode 100644 index d31015fde4741..0000000000000 --- a/ci/deps/azure-36-locale.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - pytest-asyncio - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - html5lib - - ipython - - jinja2 - - lxml - - matplotlib=3.0.* - - nomkl - - numexpr - - numpy=1.15.* - - openpyxl - # lowest supported version of pyarrow (putting it here instead of in - # azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.13 - - pytables - - python-dateutil - - pytz - - scipy - - xarray - - xlrd - - xlsxwriter - - xlwt - - moto diff --git a/ci/deps/azure-36-locale_slow.yaml 
b/ci/deps/azure-36-locale_slow.yaml deleted file mode 100644 index 23121b985492e..0000000000000 --- a/ci/deps/azure-36-locale_slow.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4=4.6.0 - - bottleneck=1.2.* - - lxml - - matplotlib=2.2.2 - - numpy=1.14.* - - openpyxl=2.5.7 - - python-dateutil - - python-blosc - - pytz=2017.2 - - scipy - - sqlalchemy=1.1.4 - - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 - - pip - - pip: - - html5lib==1.0b2 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml deleted file mode 100644 index f5af7bcf36189..0000000000000 --- a/ci/deps/azure-36-minimum_versions.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.6.1 - - # tools - - cython=0.29.16 - - pytest=5.0.1 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - psutil - - # pandas dependencies - - beautifulsoup4=4.6.0 - - bottleneck=1.2.1 - - jinja2=2.8 - - numba=0.46.0 - - numexpr=2.6.2 - - numpy=1.15.4 - - openpyxl=2.5.7 - - pytables=3.4.3 - - python-dateutil=2.7.3 - - pytz=2017.2 - - scipy=1.2 - - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 - - html5lib=1.0.1 diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml deleted file mode 100644 index 0a6d1d13c8549..0000000000000 --- a/ci/deps/azure-36-slow.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4 - - fsspec>=0.7.4 - - html5lib - - lxml - - matplotlib - - numexpr - - numpy - - openpyxl - - patsy - - psycopg2 - - pymysql - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.0 - - scipy - - sqlalchemy - - xlrd - - xlsxwriter - - xlwt - - moto diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml deleted file mode 100644 index 4dbb6a5344976..0000000000000 --- a/ci/deps/azure-37-locale.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - pytest-asyncio - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - html5lib - - ipython - - jinja2 - - lxml - - matplotlib>=3.3.0 - - moto - - nomkl - - numexpr - - numpy - - openpyxl - - pytables - - python-dateutil - - pytz - - scipy - - xarray - - xlrd - - xlsxwriter - - xlwt - - pyarrow>=0.15 - - pip - - pip: - - pyxlsb diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml deleted file mode 100644 index 451fb5884a4af..0000000000000 --- a/ci/deps/azure-37-numpydev.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: pandas-dev -channels: - - defaults -dependencies: - - python=3.7.* - - # tools - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - pytz - - pip - - pip: - - cython==0.29.16 # GH#34014 - - "git+git://github.com/dateutil/dateutil.git" - - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - - "--pre" - - "numpy" - - "scipy" diff --git a/ci/deps/azure-macos-36.yaml 
b/ci/deps/azure-macos-36.yaml deleted file mode 100644 index 81a27465f9e61..0000000000000 --- a/ci/deps/azure-macos-36.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: pandas-dev -channels: - - defaults -dependencies: - - python=3.6.* - - # tools - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - bottleneck - - html5lib - - jinja2 - - lxml - - matplotlib=2.2.3 - - nomkl - - numexpr - - numpy=1.15.4 - - openpyxl - - pyarrow>=0.13.0 - - pytables - - python-dateutil==2.7.3 - - pytz - - xarray - - xlrd - - xlsxwriter - - xlwt - - pip - - pip: - - cython>=0.29.16 - - pyreadstat - - pyxlsb diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml deleted file mode 100644 index 4d7e1d821037b..0000000000000 --- a/ci/deps/azure-windows-36.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - conda-forge - - defaults -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - blosc - - bottleneck - - fastparquet>=0.3.2 - - matplotlib=3.0.2 - - numba - - numexpr - - numpy=1.15.* - - openpyxl - - jinja2 - - pyarrow>=0.13.0 - - pytables - - python-dateutil - - pytz - - scipy - - xlrd - - xlsxwriter - - xlwt diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml deleted file mode 100644 index 34fca631df6c1..0000000000000 --- a/ci/deps/azure-windows-37.yaml +++ /dev/null @@ -1,41 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - bottleneck - - fsspec>=0.7.4 - - gcsfs>=0.6.0 - - html5lib - - jinja2 - - lxml - - matplotlib=2.2.* - - moto - - numexpr - - numpy=1.18.* - - openpyxl - - pyarrow=0.14 - - pytables - - python-dateutil - - pytz - - s3fs>=0.4.0 - - scipy - - sqlalchemy - - xlrd - - xlsxwriter - - xlwt - - pyreadstat - - pip - - pip: - - pyxlsb diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml new file mode 100644 index 0000000000000..8f309b0781457 --- /dev/null +++ b/ci/deps/circle-38-arm64.yaml @@ -0,0 +1,57 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.8 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - beautifulsoup4 + - blosc + - bottleneck + - brotlipy + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib>=3.6.1, <3.7.0 + # test_numba_vs_cython segfaults with numba 0.57 + - numba>=0.55.2, <0.57.0 + - numexpr + - openpyxl<3.1.1 + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow>=7.0.0 + - pymysql + # Not provided on ARM + #- pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs>=2021.08.0 + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - zstandard diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml deleted file mode 100644 index 5f5ea8034cddf..0000000000000 --- a/ci/deps/travis-36-cov.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - 
cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build - - # pandas dependencies - - beautifulsoup4 - - botocore>=1.11 - - cython>=0.29.16 - - dask - - fastparquet>=0.3.2 - - fsspec>=0.7.4 - - gcsfs>=0.6.0 - - geopandas - - html5lib - - matplotlib - - moto - - nomkl - - numexpr - - numpy=1.15.* - - odfpy - - openpyxl - - pandas-gbq - - psycopg2 - - pyarrow>=0.13.0 - - pymysql - - pytables - - python-snappy - - pytz - - s3fs>=0.4.0 - - scikit-learn - - scipy - - sqlalchemy - - statsmodels - - xarray - - xlrd - - xlsxwriter - - xlwt - - pip - - pip: - - brotlipy - - coverage - - pandas-datareader - - python-dateutil - - pyxlsb diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml deleted file mode 100644 index 6bc4aba733ee5..0000000000000 --- a/ci/deps/travis-36-locale.yaml +++ /dev/null @@ -1,40 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - beautifulsoup4 - - blosc=1.14.3 - - python-blosc - - fastparquet=0.3.2 - - html5lib - - ipython - - jinja2 - - lxml=3.8.0 - - matplotlib=3.0.* - - moto - - nomkl - - numexpr - - numpy - - openpyxl - - pandas-gbq=0.12.0 - - psycopg2=2.6.2 - - pymysql=0.7.11 - - pytables - - python-dateutil - - pytz - - scipy - - sqlalchemy=1.1.4 - - xarray=0.10 - - xlrd - - xlsxwriter - - xlwt diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml deleted file mode 100644 index f434a03609b26..0000000000000 --- a/ci/deps/travis-37-arm64.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.13 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - botocore>=1.11 - - numpy - - python-dateutil - - pytz - - pip - - pip: - - moto diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml deleted file mode 100644 index aaf706d61fe5c..0000000000000 --- a/ci/deps/travis-37.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.7.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - botocore>=1.11 - - fsspec>=0.7.4 - - numpy - - python-dateutil - - nomkl - - pyarrow - - pytz - - s3fs>=0.4.0 - - tabulate - - pyreadstat - - pip - - pip: - - moto diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml deleted file mode 100644 index ac39a223cd086..0000000000000 --- a/ci/deps/travis-38.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.8.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - # pandas dependencies - - numpy - - python-dateutil - - nomkl - - pytz - - pip - - tabulate==0.8.3 diff --git a/ci/fix_wheels.py b/ci/fix_wheels.py new file mode 100644 index 0000000000000..76b70fdde9ea0 --- /dev/null +++ b/ci/fix_wheels.py @@ -0,0 +1,61 @@ +""" +This file "repairs" our Windows wheels by copying the necessary DLLs for pandas to run +on a barebones Windows installation() into the wheel. 
+ +NOTE: The paths for the DLLs are hard-coded to the location of the Visual Studio +redistributables +""" +import os +import shutil +import subprocess +from subprocess import CalledProcessError +import sys +import zipfile + +try: + if len(sys.argv) != 3: + raise ValueError( + "User must pass the path to the wheel and the destination directory." + ) + wheel_path = sys.argv[1] + dest_dir = sys.argv[2] + # Figure out whether we are building on 32 or 64 bit python + is_32 = sys.maxsize <= 2**32 + PYTHON_ARCH = "x86" if is_32 else "x64" +except ValueError: + # Too many/little values to unpack + raise ValueError( + "User must pass the path to the wheel and the destination directory." + ) +if not os.path.isdir(dest_dir): + print(f"Created directory {dest_dir}") + os.mkdir(dest_dir) + +wheel_name = os.path.basename(wheel_path) +success = True + +try: + # Use the wheel CLI for zipping up the wheel since the CLI will + # take care of rebuilding the hashes found in the record file + tmp_dir = os.path.join(dest_dir, "tmp") + with zipfile.ZipFile(wheel_path, "r") as f: + # Extracting all the members of the zip + # into a specific location. + f.extractall(path=tmp_dir) + base_redist_dir = ( + f"C:/Program Files (x86)/Microsoft Visual Studio/2019/" + f"Enterprise/VC/Redist/MSVC/14.29.30133/{PYTHON_ARCH}/" + f"Microsoft.VC142.CRT/" + ) + required_dlls = ["msvcp140.dll", "concrt140.dll"] + if not is_32: + required_dlls += ["vcruntime140_1.dll"] + dest_dll_dir = os.path.join(tmp_dir, "pandas/_libs/window") + for dll in required_dlls: + src = os.path.join(base_redist_dir, dll) + shutil.copy(src, dest_dll_dir) + subprocess.run(["wheel", "pack", tmp_dir, "-d", dest_dir], check=True) +except CalledProcessError: + print("Failed to add DLLS to wheel.") + sys.exit(1) +print("Successfully repaired wheel") diff --git a/ci/meta.yaml b/ci/meta.yaml new file mode 100644 index 0000000000000..f02c7eec001fc --- /dev/null +++ b/ci/meta.yaml @@ -0,0 +1,93 @@ +{% set version = "2.0.1" %} + +package: + name: pandas + version: {{ version }} + +source: + git_url: ../.. + +build: + number: 1 + script: + - export PYTHONUNBUFFERED=1 # [ppc64le] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . # [not unix] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . --global-option="build_ext" --global-option="-j4" --no-use-pep517 # [unix] + skip: true # [py<39] + +requirements: + build: + - python # [build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - cython # [build_platform != target_platform] + - numpy # [build_platform != target_platform] + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools >=61.0.0 + - cython >=0.29.33,<3 + - numpy >=1.21.6 # [py<311] + - numpy >=1.23.2 # [py>=311] + - versioneer + - tomli # [py<311] + run: + - python + - {{ pin_compatible('numpy') }} + - python-dateutil >=2.8.2 + - pytz >=2020.1 + - python-tzdata >=2022.1 + +test: + imports: + - pandas + commands: + - pip check + # Skip test suite on PyPy as it segfaults there + # xref: https://github.com/conda-forge/pandas-feedstock/issues/148 + # + # Also skip `test_rolling_var_numerical_issues` on `ppc64le` as it is a known test failure. 
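Editor's sketch for the script above (ci/fix_wheels.py): it unpacks a built Windows wheel, copies the MSVC runtime DLLs into pandas/_libs/window, and repacks it with the wheel CLI so the RECORD hashes are rebuilt. A minimal, hypothetical sanity check for a repaired artifact, using only the standard library (the wheel filename below is a placeholder, not a real build output):

import sys
import zipfile

# Hypothetical path to a wheel already repaired by ci/fix_wheels.py; pass a real
# wheel path as the first argument when trying this out.
wheel_path = sys.argv[1] if len(sys.argv) > 1 else "pandas-2.0.1-cp311-cp311-win_amd64.whl"

with zipfile.ZipFile(wheel_path) as zf:
    bundled = [name for name in zf.namelist() if name.lower().endswith(".dll")]

# fix_wheels.py copies msvcp140.dll and concrt140.dll (plus vcruntime140_1.dll on
# 64-bit builds) into pandas/_libs/window, so they should be listed here.
print("\n".join(bundled) if bundled else "no DLLs bundled")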
+ # xref: https://github.com/conda-forge/pandas-feedstock/issues/149 + {% set markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"] %} + {% set markers = markers + ["not arm_slow"] %} # [aarch64 or ppc64le] + {% set extra_args = ["-n=2 -m " + " and ".join(markers)] %} + {% set tests_to_skip = "_not_a_real_test" %} + {% set tests_to_skip = tests_to_skip + " or test_rolling_var_numerical_issues" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_std_timedelta64_skipna_false" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_value_counts_normalized[M8[ns]]" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_to_datetime_format_YYYYMMDD_with_nat" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or (TestReductions and test_median_2d)" %} # [ppc64le] + {% set extra_args = extra_args + ["-k", "not (" + tests_to_skip + ")"] %} + - python -c "import pandas; pandas.test(extra_args={{ extra_args }})" # [python_impl == "cpython"] + requires: + - pip + - pytest >=7.0.0 + - pytest-asyncio >=0.17.0 + - pytest-xdist >=2.2.0 + - pytest-cov + - hypothesis >=6.46.1 + - tomli # [py<311] + +about: + home: http://pandas.pydata.org + license: BSD-3-Clause + license_file: LICENSE + summary: Powerful data structures for data analysis, time series, and statistics + doc_url: https://pandas.pydata.org/docs/ + dev_url: https://github.com/pandas-dev/pandas + +extra: + recipe-maintainers: + - jreback + - jorisvandenbossche + - msarahan + - ocefpaf + - TomAugspurger + - WillAyd + - simonjayhawkins + - mroeschke + - datapythonista + - phofl + - lithomas1 + - marcogorelli diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh deleted file mode 100755 index 18d9388327ddc..0000000000000 --- a/ci/prep_cython_cache.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -ls "$HOME/.cache/" - -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` -pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` - -CACHE_File="$HOME/.cache/cython_files.tar" - -# Clear the cython cache 0 = NO, 1 = YES -clear_cache=0 - -pyx_files=`echo "$pyx_file_list" | wc -l` -pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` - -if [[ pyx_files -ne pyx_cache_files ]] -then - echo "Different number of pyx files" - clear_cache=1 -fi - -home_dir=$(pwd) - -if [ -f "$CACHE_File" ] && [ -z "$NOCACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then - - echo "Cache available - checking pyx diff" - - for i in ${pyx_file_list} - do - diff=`diff -u $i $PYX_CACHE_DIR${i}` - if [[ $? -eq 2 ]] - then - echo "${i##*/} can't be diffed; probably not in cache" - clear_cache=1 - fi - if [[ ! 
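Editor's sketch for the Jinja blocks in ci/meta.yaml above: they assemble the pytest selection that the recipe's test step runs through pandas.test(). Roughly what that comes out to on a plain x86-64 Linux CPython build, where none of the per-platform marker additions or test skips apply (a sketch mirroring the recipe's list construction, not the literal rendered recipe):

import pandas

# Marker expression built by the recipe: skip clipboard/single_cpu/db/network/slow
# tests; aarch64 and ppc64le builds also append "not arm_slow".
markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"]

# tests_to_skip starts as a dummy name so the "-k not (...)" expression is always
# valid; ppc64le builds append their known-bad tests to it.
tests_to_skip = "_not_a_real_test"

extra_args = [
    "-n=2 -m " + " and ".join(markers),
    "-k",
    "not (" + tests_to_skip + ")",
]

pandas.test(extra_args=extra_args)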
-z $diff ]] - then - echo "${i##*/} has changed:" - echo $diff - clear_cache=1 - fi - done - - if [ "$TRAVIS_PULL_REQUEST" == "false" ] - then - echo "Not a PR" - # Uncomment next 2 lines to turn off cython caching not in a PR - # echo "Non PR cython caching is disabled" - # clear_cache=1 - else - echo "In a PR" - # Uncomment next 2 lines to turn off cython caching in a PR - # echo "PR cython caching is disabled" - # clear_cache=1 - fi - -fi - -if [ $clear_cache -eq 0 ] && [ -z "$NOCACHE" ] -then - # No and nocache is not set - echo "Will reuse cached cython file" - cd / - tar xvmf $CACHE_File - cd $home_dir -else - echo "Rebuilding cythonized files" - echo "No cache = $NOCACHE" - echo "Clear cache (1=YES) = $clear_cache" -fi - - -exit 0 diff --git a/ci/print_skipped.py b/ci/print_skipped.py deleted file mode 100755 index 60e2f047235e6..0000000000000 --- a/ci/print_skipped.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -import os -import xml.etree.ElementTree as et - - -def main(filename): - if not os.path.isfile(filename): - raise RuntimeError(f"Could not find junit file {repr(filename)}") - - tree = et.parse(filename) - root = tree.getroot() - current_class = "" - for el in root.iter("testcase"): - cn = el.attrib["classname"] - for sk in el.findall("skipped"): - old_class = current_class - current_class = cn - if old_class != current_class: - yield None - yield { - "class_name": current_class, - "test_name": el.attrib["name"], - "message": sk.attrib["message"], - } - - -if __name__ == "__main__": - print("SKIPPED TESTS:") - i = 1 - for test_data in main("test-data.xml"): - if test_data is None: - print("-" * 80) - else: - print( - f"#{i} {test_data['class_name']}." - f"{test_data['test_name']}: {test_data['message']}" - ) - i += 1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index fda2005ce7843..90bacef920625 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,14 +5,14 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +# May help reproduce flaky CI builds if set in subsequent runs +echo PYTHONHASHSEED=$PYTHONHASHSEED + if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi -if [ "$COVERAGE" ]; then - COVERAGE_FNAME="/tmp/test_coverage.xml" - COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" -fi +COVERAGE="-s --cov=pandas --cov-report=xml --cov-append" # If no X server is found, we use xvfb to emulate it if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then @@ -20,13 +20,11 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" +PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" + +if [[ "$PATTERN" ]]; then + PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" +fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" - -if [[ "$COVERAGE" && $? 
== 0 && "$TRAVIS_BRANCH" == "master" ]]; then - echo "uploading coverage" - echo "bash <(curl -s https://codecov.io/bash) -Z -c -f $COVERAGE_FNAME" - bash <(curl -s https://codecov.io/bash) -Z -c -f $COVERAGE_FNAME -fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh deleted file mode 100755 index aa43d8b7dd00a..0000000000000 --- a/ci/setup_env.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash -e - -if [ "$JOB" == "3.9-dev" ]; then - /bin/bash ci/build39.sh - exit 0 -fi - -# edit the locale file if needed -if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then - echo "Adding locale to the first line of pandas/__init__.py" - rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" - sed -i "$SEDC" pandas/__init__.py - - echo "[head -4 pandas/__init__.py]" - head -4 pandas/__init__.py - echo -fi - -MINICONDA_DIR="$HOME/miniconda3" - - -if [ -d "$MINICONDA_DIR" ]; then - echo - echo "rm -rf "$MINICONDA_DIR"" - rm -rf "$MINICONDA_DIR" -fi - -echo "Install Miniconda" -UNAME_OS=$(uname) -if [[ "$UNAME_OS" == 'Linux' ]]; then - if [[ "$BITS32" == "yes" ]]; then - CONDA_OS="Linux-x86" - else - CONDA_OS="Linux-x86_64" - fi -elif [[ "$UNAME_OS" == 'Darwin' ]]; then - CONDA_OS="MacOSX-x86_64" -else - echo "OS $UNAME_OS not supported" - exit 1 -fi - -if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - sudo apt-get -y install xvfb - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh" -else - CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" -fi -wget -q $CONDA_URL -O miniconda.sh -chmod +x miniconda.sh - -# Installation path is required for ARM64 platform as miniforge script installs in path $HOME/miniforge3. -./miniconda.sh -b -p $MINICONDA_DIR - -export PATH=$MINICONDA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -conda install pip conda # create conda to create a historical artifact for pip & setuptools -conda update -n base conda - -echo "conda info -a" -conda info -a - -echo -echo "set the compiler cache to work" -if [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then - echo "Using ccache" - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - GCC=$(which gcc) - echo "gcc: $GCC" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" - export CC='ccache gcc' -elif [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then - echo "Install ccache" - brew install ccache > /dev/null 2>&1 - echo "Using ccache" - export PATH=/usr/local/opt/ccache/libexec:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" -else - echo "Not using ccache" -fi - -echo "source deactivate" -source deactivate - -echo "conda list (root environment)" -conda list - -# Clean up any left-over from a previous build -# (note workaround for https://github.com/conda/conda/issues/2679: -# `conda env remove` issue) -conda remove --all -q -y -n pandas-dev - -echo -echo "conda env create -q --file=${ENV_FILE}" -time conda env create -q --file="${ENV_FILE}" - - -if [[ "$BITS32" == "yes" ]]; then - # activate 32-bit compiler - export CONDA_BUILD=1 -fi - -echo "activate pandas-dev" -source activate pandas-dev - -echo -echo "remove any installed pandas package" -echo "w/o removing anything else" -conda remove pandas -y --force || true -pip uninstall -y pandas || true - -echo -echo "remove postgres if has been 
installed with conda" -echo "we use the one from the CI" -conda remove postgresql -y --force || true - -echo -echo "remove qt" -echo "causes problems with the clipboard, we use xsel for that" -conda remove qt -y --force || true - -echo -echo "conda list pandas" -conda list pandas - -# Make sure any error below is reported as such - -echo "[Build extensions]" -python setup.py build_ext -q -i -j2 - -# TODO: Some of our environments end up with old versions of pip (10.x) -# Adding a new enough version of pip to the requirements explodes the -# solve time. Just using pip to update itself. -# - py35_macos -# - py35_compat -# - py36_32bit -echo "[Updating pip]" -python -m pip install --no-deps -U pip wheel setuptools - -echo "[Install pandas]" -python -m pip install --no-build-isolation -e . - -echo -echo "conda list" -conda list - -# Install DB for Linux - -if [[ -n ${SQL:0} ]]; then - echo "installing dbs" - mysql -e 'create database pandas_nosetest;' - psql -c 'create database pandas_nosetest;' -U postgres -else - echo "not using dbs on non-linux Travis builds or Azure Pipelines" -fi -echo "done" diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh deleted file mode 100755 index b87acef0ba11c..0000000000000 --- a/ci/submit_cython_cache.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -CACHE_File="$HOME/.cache/cython_files.tar" -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` - -rm -rf $CACHE_File -rm -rf $PYX_CACHE_DIR - -home_dir=$(pwd) - -mkdir -p $PYX_CACHE_DIR -rsync -Rv $pyx_file_list $PYX_CACHE_DIR - -echo "pyx files:" -echo $pyx_file_list - -tar cf ${CACHE_File} --files-from /dev/null - -for i in ${pyx_file_list} -do - f=${i%.pyx} - ls $f.{c,cpp} | tar rf ${CACHE_File} -T - -done - -echo "Cython files in cache tar:" -tar tvf ${CACHE_File} - -exit 0 diff --git a/ci/test_wheels.py b/ci/test_wheels.py new file mode 100644 index 0000000000000..21e49bc302093 --- /dev/null +++ b/ci/test_wheels.py @@ -0,0 +1,69 @@ +import glob +import os +import shutil +import subprocess +from subprocess import CalledProcessError +import sys + +if os.name == "nt": + py_ver = f"{sys.version_info.major}.{sys.version_info.minor}" + is_32_bit = os.getenv("IS_32_BIT") == "true" + try: + wheel_dir = sys.argv[1] + wheel_path = glob.glob(f"{wheel_dir}/*.whl")[0] + except IndexError: + # Not passed + wheel_path = None + print(f"IS_32_BIT is {is_32_bit}") + print(f"Path to built wheel is {wheel_path}") + + print("Verifying file hashes in wheel RECORD file") + try: + tmp_dir = "tmp" + subprocess.run(["wheel", "unpack", wheel_path, "-d", tmp_dir], check=True) + except CalledProcessError: + print("wheel RECORD file hash verification failed.") + sys.exit(1) + finally: + shutil.rmtree(tmp_dir) + + if is_32_bit: + sys.exit(0) # No way to test Windows 32-bit(no docker image) + if wheel_path is None: + raise ValueError("Wheel path must be passed in if on 64-bit Windows") + print(f"Pulling docker image to test Windows 64-bit Python {py_ver}") + subprocess.run(f"docker pull python:{py_ver}-windowsservercore", check=True) + pandas_base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + print(f"pandas project dir is {pandas_base_dir}") + dist_dir = os.path.join(pandas_base_dir, "dist") + print(f"Copying wheel into pandas_base_dir/dist ({dist_dir})") + os.mkdir(dist_dir) + shutil.copy(wheel_path, dist_dir) + print(os.listdir(dist_dir)) + subprocess.run( + rf"docker run -v %cd%:c:\pandas " + 
f"python:{py_ver}-windowsservercore /pandas/ci/test_wheels_windows.bat", + check=True, + shell=True, + cwd=pandas_base_dir, + ) +else: + import pandas as pd + + pd.test( + extra_args=[ + "-m not clipboard and not single_cpu", + "--skip-slow", + "--skip-network", + "--skip-db", + "-n=2", + ] + ) + pd.test( + extra_args=[ + "-m not clipboard and single_cpu", + "--skip-slow", + "--skip-network", + "--skip-db", + ] + ) diff --git a/ci/test_wheels_windows.bat b/ci/test_wheels_windows.bat new file mode 100644 index 0000000000000..ba6cfef8f2dfb --- /dev/null +++ b/ci/test_wheels_windows.bat @@ -0,0 +1,9 @@ +set test_command=import pandas as pd; print(pd.__version__); ^ +pd.test(extra_args=['-m not clipboard and not single_cpu', '--skip-slow', '--skip-network', '--skip-db', '-n=2']); ^ +pd.test(extra_args=['-m not clipboard and single_cpu', '--skip-slow', '--skip-network', '--skip-db']) + +python --version +pip install pytz six numpy python-dateutil tzdata>=2022.1 +pip install hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 +pip install --find-links=pandas/dist --no-index pandas +python -c "%test_command%" diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh deleted file mode 100755 index 7d5692d9520af..0000000000000 --- a/ci/travis_encrypt_gbq.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -GBQ_JSON_FILE=$1 - -if [[ $# -ne 1 ]]; then - echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ - "" - exit 1 -fi - -if [[ $GBQ_JSON_FILE != *.json ]]; then - echo "ERROR: Expected *.json file" - exit 1 -fi - -if [[ ! -f $GBQ_JSON_FILE ]]; then - echo "ERROR: File $GBQ_JSON_FILE does not exist" - exit 1 -fi - -echo "Encrypting $GBQ_JSON_FILE..." -read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file -r pandas-dev/pandas $GBQ_JSON_FILE \ -travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); - -echo "Adding your secure key to travis_gbq_config.txt ..." -echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\ -> travis_gbq_config.txt - -echo "Done. 
Removing file $GBQ_JSON_FILE" -rm $GBQ_JSON_FILE - -echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ - "NOTE: Do NOT commit the *.json file containing your unencrypted" \ - "private key" diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc deleted file mode 100644 index 6e0b6cee4048c..0000000000000 Binary files a/ci/travis_gbq.json.enc and /dev/null differ diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt deleted file mode 100644 index dc857c450331c..0000000000000 --- a/ci/travis_gbq_config.txt +++ /dev/null @@ -1,2 +0,0 @@ -TRAVIS_IV_ENV=encrypted_e05c934e101e_iv -TRAVIS_KEY_ENV=encrypted_e05c934e101e_key diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh deleted file mode 100755 index fccf8e1e8deff..0000000000000 --- a/ci/travis_process_gbq_encryption.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -source ci/travis_gbq_config.txt - -if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then - echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; -elif [[ -n ${!TRAVIS_IV_ENV} ]]; then - openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ - -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID='pandas-gbq-tests'; - echo 'Successfully decrypted gbq credentials' -fi - diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh new file mode 100644 index 0000000000000..f760621ea0e6b --- /dev/null +++ b/ci/upload_wheels.sh @@ -0,0 +1,41 @@ +# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh + +set_upload_vars() { + echo "IS_PUSH is $IS_PUSH" + echo "IS_SCHEDULE_DISPATCH is $IS_SCHEDULE_DISPATCH" + if [[ "$IS_PUSH" == "true" ]]; then + echo push and tag event + export ANACONDA_ORG="multibuild-wheels-staging" + export TOKEN="$PANDAS_STAGING_UPLOAD_TOKEN" + export ANACONDA_UPLOAD="true" + elif [[ "$IS_SCHEDULE_DISPATCH" == "true" ]]; then + echo scheduled or dispatched event + export ANACONDA_ORG="scipy-wheels-nightly" + export TOKEN="$PANDAS_NIGHTLY_UPLOAD_TOKEN" + export ANACONDA_UPLOAD="true" + else + echo non-dispatch event + export ANACONDA_UPLOAD="false" + fi +} +upload_wheels() { + echo ${PWD} + if [[ ${ANACONDA_UPLOAD} == true ]]; then + if [ -z ${TOKEN} ]; then + echo no token set, not uploading + else + # sdists are located under dist folder when built through setup.py + if compgen -G "./dist/*.gz"; then + echo "Found sdist" + anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz + elif compgen -G "./wheelhouse/*.whl"; then + echo "Found wheel" + anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl + else + echo "Files do not exist" + return 1 + fi + echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" + fi + fi +} diff --git a/codecov.yml b/codecov.yml index 1644bf315e0ac..d893bdbdc9298 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,7 +1,8 @@ codecov: - branch: master - -comment: off + branch: main + notify: + after_n_builds: 10 +comment: false coverage: status: @@ -11,3 +12,7 @@ coverage: patch: default: target: '50' + informational: true + +github_checks: + annotations: false diff --git a/conda.recipe/bld.bat b/conda.recipe/bld.bat deleted file mode 100644 index 284926fae8c04..0000000000000 --- a/conda.recipe/bld.bat +++ /dev/null @@ -1,2 +0,0 @@ -@echo off -%PYTHON% setup.py install diff --git a/conda.recipe/build.sh b/conda.recipe/build.sh deleted file mode 100644 index f341bce6fcf96..0000000000000 --- a/conda.recipe/build.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -$PYTHON setup.py 
install diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml deleted file mode 100644 index e833ea1f1f398..0000000000000 --- a/conda.recipe/meta.yaml +++ /dev/null @@ -1,40 +0,0 @@ -package: - name: pandas - version: {{ environ.get('GIT_DESCRIBE_TAG','').replace('v', '', 1) }} - -build: - number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }} - {% if GIT_DESCRIBE_NUMBER|int == 0 %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_0 - {% else %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_{{ GIT_BUILD_STR }}{% endif %} - -source: - git_url: ../ - -requirements: - build: - - {{ compiler('c') }} - - {{ compiler('cxx') }} - host: - - python - - pip - - cython - - numpy - - setuptools >=3.3 - - python-dateutil >=2.7.3 - - pytz - run: - - python {{ python }} - - {{ pin_compatible('numpy') }} - - python-dateutil >=2.7.3 - - pytz - -test: - requires: - - pytest - commands: - - python -c "import pandas; pandas.test()" - - -about: - home: https://pandas.pydata.org - license: BSD diff --git a/doc/README.rst b/doc/README.rst deleted file mode 100644 index 5423e7419d03b..0000000000000 --- a/doc/README.rst +++ /dev/null @@ -1 +0,0 @@ -See `contributing.rst `_ in this repo. diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html new file mode 100644 index 0000000000000..6d8caa4d6c741 --- /dev/null +++ b/doc/_templates/pandas_footer.html @@ -0,0 +1,3 @@ + diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html new file mode 100644 index 0000000000000..8298b66568e20 --- /dev/null +++ b/doc/_templates/sidebar-nav-bs.html @@ -0,0 +1,9 @@ + diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf index 48da05d053b96..3582e0c0dabf9 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx index 039b3898fa301..746f508516964 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/data/fx_prices b/doc/data/fx_prices deleted file mode 100644 index 38cadf26909a3..0000000000000 Binary files a/doc/data/fx_prices and /dev/null differ diff --git a/doc/data/iris.data b/doc/data/iris.data index c19b9c3688515..026e214e5f754 100644 --- a/doc/data/iris.data +++ b/doc/data/iris.data @@ -148,4 +148,4 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Name 6.3,2.5,5.0,1.9,Iris-virginica 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file +5.9,3.0,5.1,1.8,Iris-virginica diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv deleted file mode 100644 index 935ff936cd842..0000000000000 --- a/doc/data/mindex_ex.csv +++ /dev/null @@ -1,16 +0,0 @@ -year,indiv,zit,xit -1977,"A",1.2,.6 -1977,"B",1.5,.5 -1977,"C",1.7,.8 -1978,"A",.2,.06 -1978,"B",.7,.2 -1978,"C",.8,.3 -1978,"D",.9,.5 -1978,"E",1.4,.9 -1979,"C",.2,.15 -1979,"D",.14,.05 -1979,"E",.5,.15 -1979,"F",1.2,.5 -1979,"G",3.4,1.9 -1979,"H",5.4,2.7 -1979,"I",6.4,1.2 diff --git a/doc/data/test.xls b/doc/data/test.xls deleted file mode 100644 index db0f9dec7d5e4..0000000000000 Binary files a/doc/data/test.xls and /dev/null differ diff --git a/doc/make.py b/doc/make.py index db729853e5834..f5bf170c6274d 100755 --- a/doc/make.py +++ b/doc/make.py @@ -39,22 +39,26 @@ class DocBuilder: def __init__( self, - num_jobs=0, + num_jobs="auto", include_api=True, + whatsnew=False, single_doc=None, verbosity=0, 
warnings_are_errors=False, - ): + ) -> None: self.num_jobs = num_jobs + self.include_api = include_api + self.whatsnew = whatsnew self.verbosity = verbosity self.warnings_are_errors = warnings_are_errors if single_doc: single_doc = self._process_single_doc(single_doc) - include_api = False os.environ["SPHINX_PATTERN"] = single_doc elif not include_api: os.environ["SPHINX_PATTERN"] = "-api" + elif whatsnew: + os.environ["SPHINX_PATTERN"] = "whatsnew" self.single_doc_html = None if single_doc and single_doc.endswith(".rst"): @@ -130,7 +134,7 @@ def _sphinx_build(self, kind: str): cmd = ["sphinx-build", "-b", kind] if self.num_jobs: - cmd += ["-j", str(self.num_jobs)] + cmd += ["-j", self.num_jobs] if self.warnings_are_errors: cmd += ["-W", "--keep-going"] if self.verbosity: @@ -188,7 +192,14 @@ def _add_redirects(self): if not row or row[0].strip().startswith("#"): continue - path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html" + html_path = os.path.join(BUILD_PATH, "html") + path = os.path.join(html_path, *row[0].split("/")) + ".html" + + if not self.include_api and ( + os.path.join(html_path, "reference") in path + or os.path.join(html_path, "generated") in path + ): + continue try: title = self._get_page_title(row[1]) @@ -198,11 +209,6 @@ def _add_redirects(self): # sphinx specific stuff title = "this page" - if os.path.exists(path): - raise RuntimeError( - f"Redirection would overwrite an existing file: {path}" - ) - with open(path, "w") as moved_page_fd: html = f"""\ @@ -232,6 +238,9 @@ def html(self): self._open_browser(self.single_doc_html) else: self._add_redirects() + if self.whatsnew: + self._open_browser(os.path.join("whatsnew", "index.html")) + return ret_code def latex(self, force=False): @@ -250,8 +259,7 @@ def latex(self, force=False): "You should check the file " '"build/latex/pandas.pdf" for problems.' 
) - else: - self._run_os("make") + self._run_os("make") return ret_code def latex_forced(self): @@ -286,19 +294,25 @@ def main(): joined = ",".join(cmds) argparser = argparse.ArgumentParser( - description="pandas documentation builder", epilog=f"Commands: {joined}", + description="pandas documentation builder", epilog=f"Commands: {joined}" ) joined = ", ".join(cmds) argparser.add_argument( - "command", nargs="?", default="html", help=f"command to run: {joined}", + "command", nargs="?", default="html", help=f"command to run: {joined}" ) argparser.add_argument( - "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" + "--num-jobs", default="auto", help="number of jobs used by sphinx-build" ) argparser.add_argument( "--no-api", default=False, help="omit api and autosummary", action="https://github.com/store_true" ) + argparser.add_argument( + "--whatsnew", + default=False, + help="only build whatsnew (and api for links)", + action="https://github.com/store_true", + ) argparser.add_argument( "--single", metavar="FILENAME", @@ -350,6 +364,7 @@ def main(): builder = DocBuilder( args.num_jobs, not args.no_api, + args.whatsnew, args.single, args.verbosity, args.warnings_are_errors, diff --git a/doc/redirects.csv b/doc/redirects.csv index bceb4b5961324..97cd20b295e65 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -45,6 +45,7 @@ contributing_docstring,development/contributing_docstring developer,development/developer extending,development/extending internals,development/internals +development/meeting,community # api moved function reference/api/pandas.io.json.json_normalize,pandas.json_normalize @@ -100,7 +101,6 @@ generated/pandas.api.types.infer_dtype,../reference/api/pandas.api.types.infer_d generated/pandas.api.types.is_bool_dtype,../reference/api/pandas.api.types.is_bool_dtype generated/pandas.api.types.is_bool,../reference/api/pandas.api.types.is_bool generated/pandas.api.types.is_categorical_dtype,../reference/api/pandas.api.types.is_categorical_dtype -generated/pandas.api.types.is_categorical,../reference/api/pandas.api.types.is_categorical generated/pandas.api.types.is_complex_dtype,../reference/api/pandas.api.types.is_complex_dtype generated/pandas.api.types.is_complex,../reference/api/pandas.api.types.is_complex generated/pandas.api.types.is_datetime64_any_dtype,../reference/api/pandas.api.types.is_datetime64_any_dtype @@ -185,7 +185,6 @@ generated/pandas.core.groupby.DataFrameGroupBy.filter,../reference/api/pandas.co generated/pandas.core.groupby.DataFrameGroupBy.hist,../reference/api/pandas.core.groupby.DataFrameGroupBy.hist generated/pandas.core.groupby.DataFrameGroupBy.idxmax,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmax generated/pandas.core.groupby.DataFrameGroupBy.idxmin,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmin -generated/pandas.core.groupby.DataFrameGroupBy.mad,../reference/api/pandas.core.groupby.DataFrameGroupBy.mad generated/pandas.core.groupby.DataFrameGroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change generated/pandas.core.groupby.DataFrameGroupBy.plot,../reference/api/pandas.core.groupby.DataFrameGroupBy.plot generated/pandas.core.groupby.DataFrameGroupBy.quantile,../reference/api/pandas.core.groupby.DataFrameGroupBy.quantile @@ -195,7 +194,6 @@ generated/pandas.core.groupby.DataFrameGroupBy.shift,../reference/api/pandas.cor generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size 
generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take -generated/pandas.core.groupby.DataFrameGroupBy.tshift,../reference/api/pandas.core.groupby.DataFrameGroupBy.tshift generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all @@ -317,7 +315,6 @@ generated/pandas.DataFrame.aggregate,../reference/api/pandas.DataFrame.aggregate generated/pandas.DataFrame.align,../reference/api/pandas.DataFrame.align generated/pandas.DataFrame.all,../reference/api/pandas.DataFrame.all generated/pandas.DataFrame.any,../reference/api/pandas.DataFrame.any -generated/pandas.DataFrame.append,../reference/api/pandas.DataFrame.append generated/pandas.DataFrame.apply,../reference/api/pandas.DataFrame.apply generated/pandas.DataFrame.applymap,../reference/api/pandas.DataFrame.applymap generated/pandas.DataFrame.as_blocks,../reference/api/pandas.DataFrame.as_blocks @@ -400,10 +397,8 @@ generated/pandas.DataFrame.isna,../reference/api/pandas.DataFrame.isna generated/pandas.DataFrame.isnull,../reference/api/pandas.DataFrame.isnull generated/pandas.DataFrame.items,../reference/api/pandas.DataFrame.items generated/pandas.DataFrame.__iter__,../reference/api/pandas.DataFrame.__iter__ -generated/pandas.DataFrame.iteritems,../reference/api/pandas.DataFrame.iteritems generated/pandas.DataFrame.iterrows,../reference/api/pandas.DataFrame.iterrows generated/pandas.DataFrame.itertuples,../reference/api/pandas.DataFrame.itertuples -generated/pandas.DataFrame.ix,../reference/api/pandas.DataFrame.ix generated/pandas.DataFrame.join,../reference/api/pandas.DataFrame.join generated/pandas.DataFrame.keys,../reference/api/pandas.DataFrame.keys generated/pandas.DataFrame.kurt,../reference/api/pandas.DataFrame.kurt @@ -412,9 +407,7 @@ generated/pandas.DataFrame.last,../reference/api/pandas.DataFrame.last generated/pandas.DataFrame.last_valid_index,../reference/api/pandas.DataFrame.last_valid_index generated/pandas.DataFrame.le,../reference/api/pandas.DataFrame.le generated/pandas.DataFrame.loc,../reference/api/pandas.DataFrame.loc -generated/pandas.DataFrame.lookup,../reference/api/pandas.DataFrame.lookup generated/pandas.DataFrame.lt,../reference/api/pandas.DataFrame.lt -generated/pandas.DataFrame.mad,../reference/api/pandas.DataFrame.mad generated/pandas.DataFrame.mask,../reference/api/pandas.DataFrame.mask generated/pandas.DataFrame.max,../reference/api/pandas.DataFrame.max generated/pandas.DataFrame.mean,../reference/api/pandas.DataFrame.mean @@ -486,7 +479,6 @@ generated/pandas.DataFrame.shape,../reference/api/pandas.DataFrame.shape generated/pandas.DataFrame.shift,../reference/api/pandas.DataFrame.shift generated/pandas.DataFrame.size,../reference/api/pandas.DataFrame.size generated/pandas.DataFrame.skew,../reference/api/pandas.DataFrame.skew -generated/pandas.DataFrame.slice_shift,../reference/api/pandas.DataFrame.slice_shift generated/pandas.DataFrame.sort_index,../reference/api/pandas.DataFrame.sort_index generated/pandas.DataFrame.sort_values,../reference/api/pandas.DataFrame.sort_values generated/pandas.DataFrame.squeeze,../reference/api/pandas.DataFrame.squeeze @@ -527,7 +519,6 @@ 
generated/pandas.DataFrame.transform,../reference/api/pandas.DataFrame.transform generated/pandas.DataFrame.transpose,../reference/api/pandas.DataFrame.transpose generated/pandas.DataFrame.truediv,../reference/api/pandas.DataFrame.truediv generated/pandas.DataFrame.truncate,../reference/api/pandas.DataFrame.truncate -generated/pandas.DataFrame.tshift,../reference/api/pandas.DataFrame.tshift generated/pandas.DataFrame.tz_convert,../reference/api/pandas.DataFrame.tz_convert generated/pandas.DataFrame.tz_localize,../reference/api/pandas.DataFrame.tz_localize generated/pandas.DataFrame.unstack,../reference/api/pandas.DataFrame.unstack @@ -542,7 +533,9 @@ generated/pandas.DatetimeIndex.date,../reference/api/pandas.DatetimeIndex.date generated/pandas.DatetimeIndex.day,../reference/api/pandas.DatetimeIndex.day generated/pandas.DatetimeIndex.day_name,../reference/api/pandas.DatetimeIndex.day_name generated/pandas.DatetimeIndex.dayofweek,../reference/api/pandas.DatetimeIndex.dayofweek +generated/pandas.DatetimeIndex.day_of_week,../reference/api/pandas.DatetimeIndex.day_of_week generated/pandas.DatetimeIndex.dayofyear,../reference/api/pandas.DatetimeIndex.dayofyear +generated/pandas.DatetimeIndex.day_of_year,../reference/api/pandas.DatetimeIndex.day_of_year generated/pandas.DatetimeIndex.floor,../reference/api/pandas.DatetimeIndex.floor generated/pandas.DatetimeIndex.freq,../reference/api/pandas.DatetimeIndex.freq generated/pandas.DatetimeIndex.freqstr,../reference/api/pandas.DatetimeIndex.freqstr @@ -572,7 +565,6 @@ generated/pandas.DatetimeIndex.strftime,../reference/api/pandas.DatetimeIndex.st generated/pandas.DatetimeIndex.time,../reference/api/pandas.DatetimeIndex.time generated/pandas.DatetimeIndex.timetz,../reference/api/pandas.DatetimeIndex.timetz generated/pandas.DatetimeIndex.to_frame,../reference/api/pandas.DatetimeIndex.to_frame -generated/pandas.DatetimeIndex.to_perioddelta,../reference/api/pandas.DatetimeIndex.to_perioddelta generated/pandas.DatetimeIndex.to_period,../reference/api/pandas.DatetimeIndex.to_period generated/pandas.DatetimeIndex.to_pydatetime,../reference/api/pandas.DatetimeIndex.to_pydatetime generated/pandas.DatetimeIndex.to_series,../reference/api/pandas.DatetimeIndex.to_series @@ -634,7 +626,6 @@ generated/pandas.Index.argmax,../reference/api/pandas.Index.argmax generated/pandas.Index.argmin,../reference/api/pandas.Index.argmin generated/pandas.Index.argsort,../reference/api/pandas.Index.argsort generated/pandas.Index.array,../reference/api/pandas.Index.array -generated/pandas.Index.asi8,../reference/api/pandas.Index.asi8 generated/pandas.Index.asof,../reference/api/pandas.Index.asof generated/pandas.Index.asof_locs,../reference/api/pandas.Index.asof_locs generated/pandas.Index.astype,../reference/api/pandas.Index.astype @@ -659,7 +650,6 @@ generated/pandas.Index.get_indexer_non_unique,../reference/api/pandas.Index.get_ generated/pandas.Index.get_level_values,../reference/api/pandas.Index.get_level_values generated/pandas.Index.get_loc,../reference/api/pandas.Index.get_loc generated/pandas.Index.get_slice_bound,../reference/api/pandas.Index.get_slice_bound -generated/pandas.Index.get_value,../reference/api/pandas.Index.get_value generated/pandas.Index.groupby,../reference/api/pandas.Index.groupby generated/pandas.Index.has_duplicates,../reference/api/pandas.Index.has_duplicates generated/pandas.Index.hasnans,../reference/api/pandas.Index.hasnans @@ -669,7 +659,6 @@ generated/pandas.Index.identical,../reference/api/pandas.Index.identical 
generated/pandas.Index.inferred_type,../reference/api/pandas.Index.inferred_type generated/pandas.Index.insert,../reference/api/pandas.Index.insert generated/pandas.Index.intersection,../reference/api/pandas.Index.intersection -generated/pandas.Index.is_all_dates,../reference/api/pandas.Index.is_all_dates generated/pandas.Index.is_boolean,../reference/api/pandas.Index.is_boolean generated/pandas.Index.is_categorical,../reference/api/pandas.Index.is_categorical generated/pandas.Index.is_floating,../reference/api/pandas.Index.is_floating @@ -678,15 +667,12 @@ generated/pandas.Index.isin,../reference/api/pandas.Index.isin generated/pandas.Index.is_integer,../reference/api/pandas.Index.is_integer generated/pandas.Index.is_interval,../reference/api/pandas.Index.is_interval generated/pandas.Index.is_lexsorted_for_tuple,../reference/api/pandas.Index.is_lexsorted_for_tuple -generated/pandas.Index.is_mixed,../reference/api/pandas.Index.is_mixed generated/pandas.Index.is_monotonic_decreasing,../reference/api/pandas.Index.is_monotonic_decreasing -generated/pandas.Index.is_monotonic,../reference/api/pandas.Index.is_monotonic generated/pandas.Index.is_monotonic_increasing,../reference/api/pandas.Index.is_monotonic_increasing generated/pandas.Index.isna,../reference/api/pandas.Index.isna generated/pandas.Index.isnull,../reference/api/pandas.Index.isnull generated/pandas.Index.is_numeric,../reference/api/pandas.Index.is_numeric generated/pandas.Index.is_object,../reference/api/pandas.Index.is_object -generated/pandas.Index.is_type_compatible,../reference/api/pandas.Index.is_type_compatible generated/pandas.Index.is_unique,../reference/api/pandas.Index.is_unique generated/pandas.Index.item,../reference/api/pandas.Index.item generated/pandas.Index.join,../reference/api/pandas.Index.join @@ -709,7 +695,6 @@ generated/pandas.Index.rename,../reference/api/pandas.Index.rename generated/pandas.Index.repeat,../reference/api/pandas.Index.repeat generated/pandas.Index.searchsorted,../reference/api/pandas.Index.searchsorted generated/pandas.Index.set_names,../reference/api/pandas.Index.set_names -generated/pandas.Index.set_value,../reference/api/pandas.Index.set_value generated/pandas.Index.shape,../reference/api/pandas.Index.shape generated/pandas.Index.shift,../reference/api/pandas.Index.shift generated/pandas.Index.size,../reference/api/pandas.Index.size @@ -728,7 +713,6 @@ generated/pandas.Index.to_flat_index,../reference/api/pandas.Index.to_flat_index generated/pandas.Index.to_frame,../reference/api/pandas.Index.to_frame generated/pandas.Index.to_list,../reference/api/pandas.Index.to_list generated/pandas.Index.tolist,../reference/api/pandas.Index.tolist -generated/pandas.Index.to_native_types,../reference/api/pandas.Index.to_native_types generated/pandas.Index.to_numpy,../reference/api/pandas.Index.to_numpy generated/pandas.Index.to_series,../reference/api/pandas.Index.to_series generated/pandas.Index.transpose,../reference/api/pandas.Index.transpose @@ -839,7 +823,9 @@ generated/pandas.option_context,../reference/api/pandas.option_context generated/pandas.Period.asfreq,../reference/api/pandas.Period.asfreq generated/pandas.Period.day,../reference/api/pandas.Period.day generated/pandas.Period.dayofweek,../reference/api/pandas.Period.dayofweek +generated/pandas.Period.day_of_week,../reference/api/pandas.Period.day_of_week generated/pandas.Period.dayofyear,../reference/api/pandas.Period.dayofyear +generated/pandas.Period.day_of_year,../reference/api/pandas.Period.day_of_year 
generated/pandas.Period.days_in_month,../reference/api/pandas.Period.days_in_month generated/pandas.Period.daysinmonth,../reference/api/pandas.Period.daysinmonth generated/pandas.Period.end_time,../reference/api/pandas.Period.end_time @@ -850,7 +836,9 @@ generated/pandas.Period,../reference/api/pandas.Period generated/pandas.PeriodIndex.asfreq,../reference/api/pandas.PeriodIndex.asfreq generated/pandas.PeriodIndex.day,../reference/api/pandas.PeriodIndex.day generated/pandas.PeriodIndex.dayofweek,../reference/api/pandas.PeriodIndex.dayofweek +generated/pandas.PeriodIndex.day_of_week,../reference/api/pandas.PeriodIndex.day_of_week generated/pandas.PeriodIndex.dayofyear,../reference/api/pandas.PeriodIndex.dayofyear +generated/pandas.PeriodIndex.day_of_year,../reference/api/pandas.PeriodIndex.day_of_year generated/pandas.PeriodIndex.days_in_month,../reference/api/pandas.PeriodIndex.days_in_month generated/pandas.PeriodIndex.daysinmonth,../reference/api/pandas.PeriodIndex.daysinmonth generated/pandas.PeriodIndex.end_time,../reference/api/pandas.PeriodIndex.end_time @@ -927,7 +915,6 @@ generated/pandas.Series.aggregate,../reference/api/pandas.Series.aggregate generated/pandas.Series.align,../reference/api/pandas.Series.align generated/pandas.Series.all,../reference/api/pandas.Series.all generated/pandas.Series.any,../reference/api/pandas.Series.any -generated/pandas.Series.append,../reference/api/pandas.Series.append generated/pandas.Series.apply,../reference/api/pandas.Series.apply generated/pandas.Series.argmax,../reference/api/pandas.Series.argmax generated/pandas.Series.argmin,../reference/api/pandas.Series.argmin @@ -993,7 +980,9 @@ generated/pandas.Series.dt.date,../reference/api/pandas.Series.dt.date generated/pandas.Series.dt.day,../reference/api/pandas.Series.dt.day generated/pandas.Series.dt.day_name,../reference/api/pandas.Series.dt.day_name generated/pandas.Series.dt.dayofweek,../reference/api/pandas.Series.dt.dayofweek +generated/pandas.Series.dt.day_of_week,../reference/api/pandas.Series.dt.day_of_week generated/pandas.Series.dt.dayofyear,../reference/api/pandas.Series.dt.dayofyear +generated/pandas.Series.dt.day_of_year,../reference/api/pandas.Series.dt.day_of_year generated/pandas.Series.dt.days,../reference/api/pandas.Series.dt.days generated/pandas.Series.dt.days_in_month,../reference/api/pandas.Series.dt.days_in_month generated/pandas.Series.dt.daysinmonth,../reference/api/pandas.Series.dt.daysinmonth @@ -1074,7 +1063,6 @@ generated/pandas.Series.interpolate,../reference/api/pandas.Series.interpolate generated/pandas.Series.is_copy,../reference/api/pandas.Series.is_copy generated/pandas.Series.isin,../reference/api/pandas.Series.isin generated/pandas.Series.is_monotonic_decreasing,../reference/api/pandas.Series.is_monotonic_decreasing -generated/pandas.Series.is_monotonic,../reference/api/pandas.Series.is_monotonic generated/pandas.Series.is_monotonic_increasing,../reference/api/pandas.Series.is_monotonic_increasing generated/pandas.Series.isna,../reference/api/pandas.Series.isna generated/pandas.Series.isnull,../reference/api/pandas.Series.isnull @@ -1082,8 +1070,6 @@ generated/pandas.Series.is_unique,../reference/api/pandas.Series.is_unique generated/pandas.Series.item,../reference/api/pandas.Series.item generated/pandas.Series.items,../reference/api/pandas.Series.items generated/pandas.Series.__iter__,../reference/api/pandas.Series.__iter__ -generated/pandas.Series.iteritems,../reference/api/pandas.Series.iteritems 
-generated/pandas.Series.ix,../reference/api/pandas.Series.ix generated/pandas.Series.keys,../reference/api/pandas.Series.keys generated/pandas.Series.kurt,../reference/api/pandas.Series.kurt generated/pandas.Series.kurtosis,../reference/api/pandas.Series.kurtosis @@ -1092,7 +1078,6 @@ generated/pandas.Series.last_valid_index,../reference/api/pandas.Series.last_val generated/pandas.Series.le,../reference/api/pandas.Series.le generated/pandas.Series.loc,../reference/api/pandas.Series.loc generated/pandas.Series.lt,../reference/api/pandas.Series.lt -generated/pandas.Series.mad,../reference/api/pandas.Series.mad generated/pandas.Series.map,../reference/api/pandas.Series.map generated/pandas.Series.mask,../reference/api/pandas.Series.mask generated/pandas.Series.max,../reference/api/pandas.Series.max @@ -1166,7 +1151,6 @@ generated/pandas.Series.shape,../reference/api/pandas.Series.shape generated/pandas.Series.shift,../reference/api/pandas.Series.shift generated/pandas.Series.size,../reference/api/pandas.Series.size generated/pandas.Series.skew,../reference/api/pandas.Series.skew -generated/pandas.Series.slice_shift,../reference/api/pandas.Series.slice_shift generated/pandas.Series.sort_index,../reference/api/pandas.Series.sort_index generated/pandas.Series.sort_values,../reference/api/pandas.Series.sort_values generated/pandas.Series.sparse.density,../reference/api/pandas.Series.sparse.density @@ -1189,6 +1173,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find +generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get generated/pandas.Series.str,../reference/api/pandas.Series.str @@ -1260,7 +1245,6 @@ generated/pandas.Series.transform,../reference/api/pandas.Series.transform generated/pandas.Series.transpose,../reference/api/pandas.Series.transpose generated/pandas.Series.truediv,../reference/api/pandas.Series.truediv generated/pandas.Series.truncate,../reference/api/pandas.Series.truncate -generated/pandas.Series.tshift,../reference/api/pandas.Series.tshift generated/pandas.Series.tz_convert,../reference/api/pandas.Series.tz_convert generated/pandas.Series.tz_localize,../reference/api/pandas.Series.tz_localize generated/pandas.Series.unique,../reference/api/pandas.Series.unique @@ -1326,14 +1310,14 @@ generated/pandas.Timestamp.date,../reference/api/pandas.Timestamp.date generated/pandas.Timestamp.day,../reference/api/pandas.Timestamp.day generated/pandas.Timestamp.day_name,../reference/api/pandas.Timestamp.day_name generated/pandas.Timestamp.dayofweek,../reference/api/pandas.Timestamp.dayofweek +generated/pandas.Timestamp.day_of_week,../reference/api/pandas.Timestamp.day_of_week generated/pandas.Timestamp.dayofyear,../reference/api/pandas.Timestamp.dayofyear +generated/pandas.Timestamp.day_of_year,../reference/api/pandas.Timestamp.day_of_year generated/pandas.Timestamp.days_in_month,../reference/api/pandas.Timestamp.days_in_month generated/pandas.Timestamp.daysinmonth,../reference/api/pandas.Timestamp.daysinmonth generated/pandas.Timestamp.dst,../reference/api/pandas.Timestamp.dst 
generated/pandas.Timestamp.floor,../reference/api/pandas.Timestamp.floor generated/pandas.Timestamp.fold,../reference/api/pandas.Timestamp.fold -generated/pandas.Timestamp.freq,../reference/api/pandas.Timestamp.freq -generated/pandas.Timestamp.freqstr,../reference/api/pandas.Timestamp.freqstr generated/pandas.Timestamp.fromisoformat,../reference/api/pandas.Timestamp.fromisoformat generated/pandas.Timestamp.fromordinal,../reference/api/pandas.Timestamp.fromordinal generated/pandas.Timestamp.fromtimestamp,../reference/api/pandas.Timestamp.fromtimestamp diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py new file mode 100644 index 0000000000000..559689ef5915b --- /dev/null +++ b/doc/scripts/eval_performance.py @@ -0,0 +1,108 @@ +from timeit import repeat as timeit + +import numpy as np +import seaborn as sns + +from pandas import DataFrame + +setup_common = """from pandas import DataFrame +import numpy as np +df = DataFrame(np.random.randn(%d, 3), columns=list('abc')) +%s""" + +setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" + + +def bench_with(n, times=10, repeat=3, engine="numexpr"): + return ( + np.array( + timeit( + f"df.eval(s, engine={repr(engine)})", + setup=setup_common % (n, setup_with), + repeat=repeat, + number=times, + ) + ) + / times + ) + + +setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" + + +def bench_subset(n, times=20, repeat=3, engine="numexpr"): + return ( + np.array( + timeit( + f"df.query(s, engine={repr(engine)})", + setup=setup_common % (n, setup_subset), + repeat=repeat, + number=times, + ) + ) + / times + ) + + +def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False): + r = np.logspace(mn, mx, num=num).round().astype(int) + + ev = DataFrame(np.empty((num, len(engines))), columns=engines) + qu = ev.copy(deep=True) + + ev["size"] = qu["size"] = r + + for engine in engines: + for i, n in enumerate(r): + if verbose & (i % 10 == 0): + print(f"engine: {repr(engine)}, i == {i:d}") + ev_times = bench_with(n, times=1, repeat=1, engine=engine) + ev.loc[i, engine] = np.mean(ev_times) + qu_times = bench_subset(n, times=1, repeat=1, engine=engine) + qu.loc[i, engine] = np.mean(qu_times) + + return ev, qu + + +def plot_perf(df, engines, title, filename=None): + from matplotlib.pyplot import figure + + sns.set() + sns.set_palette("Set2") + + fig = figure(figsize=(4, 3), dpi=120) + ax = fig.add_subplot(111) + + for engine in engines: + ax.loglog(df["size"], df[engine], label=engine, lw=2) + + ax.set_xlabel("Number of Rows") + ax.set_ylabel("Time (s)") + ax.set_title(title) + ax.legend(loc="best") + ax.tick_params(top=False, right=False) + + fig.tight_layout() + + if filename is not None: + fig.savefig(filename) + + +if __name__ == "__main__": + import os + + pandas_dir = os.path.dirname( + os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + ) + static_path = os.path.join(pandas_dir, "doc", "source", "_static") + + join = lambda p: os.path.join(static_path, p) + + fn = join("eval-query-perf-data.h5") + + engines = "python", "numexpr" + + ev, qu = bench(verbose=True) # only this one + + plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png")) + plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png")) diff --git a/doc/source/_static/banklist.html b/doc/source/_static/banklist.html deleted file mode 100644 index cb07c332acbe7..0000000000000 --- a/doc/source/_static/banklist.html +++ /dev/null @@ -1,4885 +0,0 @@ - - - - -FDIC: Failed Bank List - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - -Skip Header -
-
-
- - -
- - -

Federal Deposit
Insurance Corporation

-

Each depositor insured to at least $250,000 per insured bank

-
- -
-
- - - - - - -
- -

Failed Bank List

- -

The FDIC is often appointed as receiver for failed banks. This page contains useful information for the customers and vendors of these banks. This includes information on the acquiring bank (if applicable), how your accounts and loans are affected, and how vendors can file claims against the receivership. Failed Financial Institution Contact Search displays point of contact information related to failed banks.

- -

This list includes banks which have failed since October 1, 2000. To search for banks that failed prior to those on this page, visit this link: Failures and Assistance Transactions

- -

Failed Bank List - CSV file (Updated on Mondays. Also opens in Excel - Excel Help)

- -

Due to the small screen size some information is no longer visible.
Full information available when viewed on a larger screen.

- - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Bank NameCitySTCERTAcquiring InstitutionClosing DateUpdated Date
Banks of Wisconsin d/b/a Bank of KenoshaKenoshaWI35386North Shore Bank, FSBMay 31, 2013May 31, 2013
Central Arizona BankScottsdaleAZ34527Western State BankMay 14, 2013May 20, 2013
Sunrise BankValdostaGA58185Synovus BankMay 10, 2013May 21, 2013
Pisgah Community BankAshevilleNC58701Capital Bank, N.A.May 10, 2013May 14, 2013
Douglas County BankDouglasvilleGA21649Hamilton State BankApril 26, 2013May 16, 2013
Parkway BankLenoirNC57158CertusBank, National AssociationApril 26, 2013May 17, 2013
Chipola Community BankMariannaFL58034First Federal Bank of FloridaApril 19, 2013May 16, 2013
Heritage Bank of North FloridaOrange ParkFL26680FirstAtlantic BankApril 19, 2013May 16, 2013
First Federal BankLexingtonKY29594Your Community BankApril 19, 2013April 23, 2013
Gold Canyon BankGold CanyonAZ58066First Scottsdale Bank, National AssociationApril 5, 2013April 9, 2013
Frontier BankLaGrangeGA16431HeritageBank of the SouthMarch 8, 2013March 26, 2013
Covenant BankChicagoIL22476Liberty Bank and Trust CompanyFebruary 15, 2013March 4, 2013
1st Regents BankAndoverMN57157First Minnesota BankJanuary 18, 2013February 28, 2013
Westside Community BankUniversity PlaceWA33997Sunwest BankJanuary 11, 2013January 24, 2013
Community Bank of the OzarksSunrise BeachMO27331Bank of SullivanDecember 14, 2012January 24, 2013
Hometown Community BankBraseltonGA57928CertusBank, National AssociationNovember 16, 2012January 24, 2013
Citizens First National BankPrincetonIL3731Heartland Bank and Trust CompanyNovember 2, 2012January 24, 2013
Heritage Bank of FloridaLutzFL35009Centennial BankNovember 2, 2012January 24, 2013
NOVA BankBerwynPA27148No AcquirerOctober 26, 2012January 24, 2013
Excel BankSedaliaMO19189Simmons First National BankOctober 19, 2012January 24, 2013
First East Side Savings BankTamaracFL28144Stearns Bank N.A.October 19, 2012January 24, 2013
GulfSouth Private BankDestinFL58073SmartBankOctober 19, 2012January 24, 2013
First United BankCreteIL20685Old Plank Trail Community Bank, National AssociationSeptember 28, 2012November 15, 2012
Truman BankSt. LouisMO27316Simmons First National BankSeptember 14, 2012December 17, 2012
First Commercial BankBloomingtonMN35246Republic Bank & Trust CompanySeptember 7, 2012December 17, 2012
Waukegan Savings BankWaukeganIL28243First Midwest BankAugust 3, 2012October 11, 2012
Jasper Banking CompanyJasperGA16240Stearns Bank N.A.July 27, 2012December 17, 2012
Second Federal Savings and Loan Association of ChicagoChicagoIL27986Hinsdale Bank & Trust CompanyJuly 20, 2012January 14, 2013
Heartland BankLeawoodKS1361Metcalf BankJuly 20, 2012December 17, 2012
First Cherokee State BankWoodstockGA32711Community & Southern BankJuly 20, 2012October 31, 2012
Georgia Trust BankBufordGA57847Community & Southern BankJuly 20, 2012December 17, 2012
The Royal Palm Bank of FloridaNaplesFL57096First National Bank of the Gulf CoastJuly 20, 2012January 7, 2013
Glasgow Savings BankGlasgowMO1056Regional Missouri BankJuly 13, 2012October 11, 2012
Montgomery Bank & TrustAileyGA19498Ameris BankJuly 6, 2012October 31, 2012
The Farmers Bank of LynchburgLynchburgTN1690Clayton Bank and TrustJune 15, 2012October 31, 2012
Security Exchange BankMariettaGA35299Fidelity BankJune 15, 2012October 10, 2012
Putnam State BankPalatkaFL27405Harbor Community BankJune 15, 2012October 10, 2012
Waccamaw BankWhitevilleNC34515First Community BankJune 8, 2012November 8, 2012
Farmers' and Traders' State BankShabbonaIL9257First State BankJune 8, 2012October 10, 2012
Carolina Federal Savings BankCharlestonSC35372Bank of North CarolinaJune 8, 2012October 31, 2012
First Capital BankKingfisherOK416F & M BankJune 8, 2012October 10, 2012
Alabama Trust Bank, National AssociationSylacaugaAL35224Southern States BankMay 18, 2012May 20, 2013
Security Bank, National AssociationNorth LauderdaleFL23156Banesco USAMay 4, 2012October 31, 2012
Palm Desert National BankPalm DesertCA23632Pacific Premier BankApril 27, 2012May 17, 2013
Plantation Federal BankPawleys IslandSC32503First Federal BankApril 27, 2012May 17, 2013
Inter Savings Bank, fsb D/B/A InterBank, fsbMaple GroveMN31495Great Southern BankApril 27, 2012May 17, 2013
HarVest Bank of MarylandGaithersburgMD57766SonabankApril 27, 2012May 17, 2013
Bank of the Eastern ShoreCambridgeMD26759No AcquirerApril 27, 2012October 17, 2012
Fort Lee Federal Savings Bank, FSBFort LeeNJ35527Alma BankApril 20, 2012May 17, 2013
Fidelity BankDearbornMI33883The Huntington National BankMarch 30, 2012May 16, 2013
Premier BankWilmetteIL35419International Bank of ChicagoMarch 23, 2012October 17, 2012
Covenant Bank & TrustRock SpringGA58068Stearns Bank, N.A.March 23, 2012October 31, 2012
New City BankChicagoIL57597No AcquirerMarch 9, 2012October 29, 2012
Global Commerce BankDoravilleGA34046Metro City BankMarch 2, 2012October 31, 2012
Home Savings of AmericaLittle FallsMN29178No AcquirerFebruary 24, 2012December 17, 2012
Central Bank of GeorgiaEllavilleGA5687Ameris BankFebruary 24, 2012August 9, 2012
SCB BankShelbyvilleIN29761First Merchants Bank, National AssociationFebruary 10, 2012March 25, 2013
Charter National Bank and TrustHoffman EstatesIL23187Barrington Bank & Trust Company, National AssociationFebruary 10, 2012March 25, 2013
BankEastKnoxvilleTN19869U.S.Bank National AssociationJanuary 27, 2012March 8, 2013
Patriot Bank MinnesotaForest LakeMN34823First Resource BankJanuary 27, 2012September 12, 2012
Tennessee Commerce BankFranklinTN35296Republic Bank & Trust CompanyJanuary 27, 2012November 20, 2012
First Guaranty Bank and Trust Company of JacksonvilleJacksonvilleFL16579CenterState Bank of Florida, N.A.January 27, 2012September 12, 2012
American Eagle Savings BankBoothwynPA31581Capital Bank, N.A.January 20, 2012January 25, 2013
The First State BankStockbridgeGA19252Hamilton State BankJanuary 20, 2012January 25, 2013
Central Florida State BankBelleviewFL57186CenterState Bank of Florida, N.A.January 20, 2012January 25, 2013
Western National BankPhoenixAZ57917Washington FederalDecember 16, 2011August 13, 2012
Premier Community Bank of the Emerald CoastCrestviewFL58343Summit BankDecember 16, 2011September 12, 2012
Central Progressive BankLacombeLA19657First NBC BankNovember 18, 2011August 13, 2012
Polk County BankJohnstonIA14194Grinnell State BankNovember 18, 2011August 15, 2012
Community Bank of RockmartRockmartGA57860Century Bank of GeorgiaNovember 10, 2011August 13, 2012
SunFirst BankSaint GeorgeUT57087Cache Valley BankNovember 4, 2011November 16, 2012
Mid City Bank, Inc.OmahaNE19397Premier BankNovember 4, 2011August 15, 2012
All American BankDes PlainesIL57759International Bank of ChicagoOctober 28, 2011August 15, 2012
Community Banks of ColoradoGreenwood VillageCO21132Bank Midwest, N.A.October 21, 2011January 2, 2013
Community Capital BankJonesboroGA57036State Bank and Trust CompanyOctober 21, 2011November 8, 2012
Decatur First BankDecaturGA34392Fidelity BankOctober 21, 2011November 8, 2012
Old Harbor BankClearwaterFL575371st United BankOctober 21, 2011November 8, 2012
Country BankAledoIL35395Blackhawk Bank & TrustOctober 14, 2011August 15, 2012
First State BankCranfordNJ58046Northfield BankOctober 14, 2011November 8, 2012
Blue Ridge Savings Bank, Inc.AshevilleNC32347Bank of North CarolinaOctober 14, 2011November 8, 2012
Piedmont Community BankGrayGA57256State Bank and Trust CompanyOctober 14, 2011January 22, 2013
Sun Security BankEllingtonMO20115Great Southern BankOctober 7, 2011November 7, 2012
The RiverBankWyomingMN10216Central BankOctober 7, 2011November 7, 2012
First International BankPlanoTX33513American First National BankSeptember 30, 2011October 9, 2012
Citizens Bank of Northern CaliforniaNevada CityCA33983Tri Counties BankSeptember 23, 2011October 9, 2012
Bank of the CommonwealthNorfolkVA20408Southern Bank and Trust CompanySeptember 23, 2011October 9, 2012
The First National Bank of FloridaMiltonFL25155CharterBankSeptember 9, 2011September 6, 2012
CreekSide BankWoodstockGA58226Georgia Commerce BankSeptember 2, 2011September 6, 2012
Patriot Bank of GeorgiaCummingGA58273Georgia Commerce BankSeptember 2, 2011November 2, 2012
First Choice BankGenevaIL57212Inland Bank & TrustAugust 19, 2011August 15, 2012
First Southern National BankStatesboroGA57239Heritage Bank of the SouthAugust 19, 2011November 2, 2012
Lydian Private BankPalm BeachFL35356Sabadell United Bank, N.A.August 19, 2011November 2, 2012
Public Savings BankHuntingdon ValleyPA34130Capital Bank, N.A.August 18, 2011August 15, 2012
The First National Bank of OlatheOlatheKS4744Enterprise Bank & TrustAugust 12, 2011August 23, 2012
Bank of WhitmanColfaxWA22528Columbia State BankAugust 5, 2011August 16, 2012
Bank of ShorewoodShorewoodIL22637Heartland Bank and Trust CompanyAugust 5, 2011August 16, 2012
Integra Bank National AssociationEvansvilleIN4392Old National BankJuly 29, 2011August 16, 2012
BankMeridian, N.A.ColumbiaSC58222SCBT National AssociationJuly 29, 2011November 2, 2012
Virginia Business BankRichmondVA58283Xenith BankJuly 29, 2011October 9, 2012
Bank of ChoiceGreeleyCO2994Bank Midwest, N.A.July 22, 2011September 12, 2012
LandMark Bank of FloridaSarasotaFL35244American Momentum BankJuly 22, 2011November 2, 2012
Southshore Community BankApollo BeachFL58056American Momentum BankJuly 22, 2011November 2, 2012
Summit BankPrescottAZ57442The Foothills BankJuly 15, 2011August 16, 2012
First Peoples BankPort St. LucieFL34870Premier American Bank, N.A.July 15, 2011November 2, 2012
High Trust BankStockbridgeGA19554Ameris BankJuly 15, 2011November 2, 2012
One Georgia BankAtlantaGA58238Ameris BankJuly 15, 2011November 2, 2012
Signature BankWindsorCO57835Points West Community BankJuly 8, 2011October 26, 2012
Colorado Capital BankCastle RockCO34522First-Citizens Bank & Trust CompanyJuly 8, 2011January 15, 2013
First Chicago Bank & TrustChicagoIL27935Northbrook Bank & Trust CompanyJuly 8, 2011September 9, 2012
Mountain Heritage BankClaytonGA57593First American Bank and Trust CompanyJune 24, 2011November 2, 2012
First Commercial Bank of Tampa BayTampaFL27583Stonegate BankJune 17, 2011November 2, 2012
McIntosh State BankJacksonGA19237Hamilton State BankJune 17, 2011November 2, 2012
Atlantic Bank and TrustCharlestonSC58420First Citizens Bank and Trust Company, Inc.June 3, 2011October 31, 2012
First Heritage BankSnohomishWA23626Columbia State BankMay 27, 2011January 28, 2013
Summit BankBurlingtonWA513Columbia State BankMay 20, 2011January 22, 2013
First Georgia Banking CompanyFranklinGA57647CertusBank, National AssociationMay 20, 2011November 13, 2012
Atlantic Southern BankMaconGA57213CertusBank, National AssociationMay 20, 2011October 31, 2012
Coastal BankCocoa BeachFL34898Florida Community Bank, a division of Premier American Bank, N.A.May 6, 2011November 30, 2012
Community Central BankMount ClemensMI34234Talmer Bank & TrustApril 29, 2011August 16, 2012
The Park Avenue BankValdostaGA19797Bank of the OzarksApril 29, 2011November 30, 2012
First Choice Community BankDallasGA58539Bank of the OzarksApril 29, 2011January 22, 2013
Cortez Community BankBrooksvilleFL57625Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
First National Bank of Central FloridaWinter ParkFL26297Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
Heritage Banking GroupCarthageMS14273Trustmark National BankApril 15, 2011November 30, 2012
Rosemount National BankRosemountMN24099Central BankApril 15, 2011August 16, 2012
Superior BankBirminghamAL17750Superior Bank, National AssociationApril 15, 2011November 30, 2012
Nexity BankBirminghamAL19794AloStar Bank of CommerceApril 15, 2011September 4, 2012
New Horizons BankEast EllijayGA57705Citizens South BankApril 15, 2011August 16, 2012
Bartow County BankCartersvilleGA21495Hamilton State BankApril 15, 2011January 22, 2013
Nevada Commerce BankLas VegasNV35418City National BankApril 8, 2011September 9, 2012
Western Springs National Bank and TrustWestern SpringsIL10086Heartland Bank and Trust CompanyApril 8, 2011January 22, 2013
The Bank of CommerceWood DaleIL34292Advantage National Bank GroupMarch 25, 2011January 22, 2013
Legacy BankMilwaukeeWI34818Seaway Bank and Trust CompanyMarch 11, 2011September 12, 2012
First National Bank of DavisDavisOK4077The Pauls Valley National BankMarch 11, 2011August 20, 2012
Valley Community BankSt. CharlesIL34187First State BankFebruary 25, 2011September 12, 2012
San Luis Trust Bank, FSBSan Luis ObispoCA34783First California BankFebruary 18, 2011August 20, 2012
Charter Oak BankNapaCA57855Bank of MarinFebruary 18, 2011September 12, 2012
Citizens Bank of EffinghamSpringfieldGA34601Heritage Bank of the SouthFebruary 18, 2011November 2, 2012
Habersham BankClarkesvilleGA151SCBT National AssociationFebruary 18, 2011November 2, 2012
Canyon National BankPalm SpringsCA34692Pacific Premier BankFebruary 11, 2011September 12, 2012
Badger State BankCassvilleWI13272Royal BankFebruary 11, 2011September 12, 2012
Peoples State BankHamtramckMI14939First Michigan BankFebruary 11, 2011January 22, 2013
Sunshine State Community BankPort OrangeFL35478Premier American Bank, N.A.February 11, 2011November 2, 2012
Community First Bank ChicagoChicagoIL57948Northbrook Bank & Trust CompanyFebruary 4, 2011August 20, 2012
North Georgia BankWatkinsvilleGA35242BankSouthFebruary 4, 2011November 2, 2012
American Trust BankRoswellGA57432Renasant BankFebruary 4, 2011October 31, 2012
First Community BankTaosNM12261U.S. Bank, N.A.January 28, 2011September 12, 2012
FirsTier BankLouisvilleCO57646No AcquirerJanuary 28, 2011September 12, 2012
Evergreen State BankStoughtonWI5328McFarland State BankJanuary 28, 2011September 12, 2012
The First State BankCamargoOK2303Bank 7January 28, 2011September 12, 2012
United Western BankDenverCO31293First-Citizens Bank & Trust CompanyJanuary 21, 2011September 12, 2012
The Bank of AshevilleAshevilleNC34516First BankJanuary 21, 2011November 2, 2012
CommunitySouth Bank & TrustEasleySC57868CertusBank, National AssociationJanuary 21, 2011November 2, 2012
Enterprise Banking CompanyMcDonoughGA19758No AcquirerJanuary 21, 2011November 2, 2012
Oglethorpe BankBrunswickGA57440Bank of the OzarksJanuary 14, 2011November 2, 2012
Legacy BankScottsdaleAZ57820Enterprise Bank & TrustJanuary 7, 2011September 12, 2012
First Commercial Bank of FloridaOrlandoFL34965First Southern BankJanuary 7, 2011November 2, 2012
Community National BankLino LakesMN23306Farmers & Merchants Savings BankDecember 17, 2010August 20, 2012
First Southern BankBatesvilleAR58052Southern BankDecember 17, 2010August 20, 2012
United Americas Bank, N.A.AtlantaGA35065State Bank and Trust CompanyDecember 17, 2010November 2, 2012
Appalachian Community Bank, FSBMcCaysvilleGA58495Peoples Bank of East TennesseeDecember 17, 2010October 31, 2012
Chestatee State BankDawsonvilleGA34578Bank of the OzarksDecember 17, 2010November 2, 2012
The Bank of Miami,N.A.Coral GablesFL190401st United BankDecember 17, 2010November 2, 2012
Earthstar BankSouthamptonPA35561Polonia BankDecember 10, 2010August 20, 2012
Paramount BankFarmington HillsMI34673Level One BankDecember 10, 2010August 20, 2012
First Banking CenterBurlingtonWI5287First Michigan BankNovember 19, 2010August 20, 2012
Allegiance Bank of North AmericaBala CynwydPA35078VIST BankNovember 19, 2010August 20, 2012
Gulf State Community BankCarrabelleFL20340Centennial BankNovember 19, 2010November 2, 2012
Copper Star BankScottsdaleAZ35463Stearns Bank, N.A.November 12, 2010August 20, 2012
Darby Bank & Trust Co.VidaliaGA14580Ameris BankNovember 12, 2010January 15, 2013
Tifton Banking CompanyTiftonGA57831Ameris BankNovember 12, 2010November 2, 2012
First Vietnamese American Bank
In Vietnamese
WestminsterCA57885Grandpoint BankNovember 5, 2010September 12, 2012
Pierce Commercial BankTacomaWA34411Heritage BankNovember 5, 2010August 20, 2012
Western Commercial BankWoodland HillsCA58087First California BankNovember 5, 2010September 12, 2012
K BankRandallstownMD31263Manufacturers and Traders Trust Company (M&T Bank)November 5, 2010August 20, 2012
First Arizona Savings, A FSBScottsdaleAZ32582No AcquirerOctober 22, 2010August 20, 2012
Hillcrest BankOverland ParkKS22173Hillcrest Bank, N.A.October 22, 2010August 20, 2012
First Suburban National BankMaywoodIL16089Seaway Bank and Trust CompanyOctober 22, 2010August 20, 2012
The First National Bank of BarnesvilleBarnesvilleGA2119United BankOctober 22, 2010November 2, 2012
The Gordon BankGordonGA33904Morris BankOctober 22, 2010November 2, 2012
Progress Bank of FloridaTampaFL32251Bay Cities BankOctober 22, 2010November 2, 2012
First Bank of JacksonvilleJacksonvilleFL27573Ameris BankOctober 22, 2010November 2, 2012
Premier BankJefferson CityMO34016Providence BankOctober 15, 2010August 20, 2012
WestBridge Bank and Trust CompanyChesterfieldMO58205Midland States BankOctober 15, 2010August 20, 2012
Security Savings Bank, F.S.B.OlatheKS30898Simmons First National BankOctober 15, 2010August 20, 2012
Shoreline BankShorelineWA35250GBC International BankOctober 1, 2010August 20, 2012
Wakulla BankCrawfordvilleFL21777Centennial BankOctober 1, 2010November 2, 2012
North County BankArlingtonWA35053Whidbey Island BankSeptember 24, 2010August 20, 2012
Haven Trust Bank FloridaPonte Vedra BeachFL58308First Southern BankSeptember 24, 2010November 5, 2012
Maritime Savings BankWest AllisWI28612North Shore Bank, FSBSeptember 17, 2010August 20, 2012
Bramble Savings BankMilfordOH27808Foundation BankSeptember 17, 2010August 20, 2012
The Peoples BankWinderGA182Community & Southern BankSeptember 17, 2010November 5, 2012
First Commerce Community BankDouglasvilleGA57448Community & Southern BankSeptember 17, 2010January 15, 2013
Bank of EllijayEllijayGA58197Community & Southern BankSeptember 17, 2010January 15, 2013
ISN BankCherry HillNJ57107Customers BankSeptember 17, 2010August 22, 2012
Horizon BankBradentonFL35061Bank of the OzarksSeptember 10, 2010November 5, 2012
Sonoma Valley BankSonomaCA27259Westamerica BankAugust 20, 2010September 12, 2012
Los Padres BankSolvangCA32165Pacific Western BankAugust 20, 2010September 12, 2012
Butte Community BankChicoCA33219Rabobank, N.A.August 20, 2010September 12, 2012
Pacific State BankStocktonCA27090Rabobank, N.A.August 20, 2010September 12, 2012
ShoreBankChicagoIL15640Urban Partnership BankAugust 20, 2010May 16, 2013
Imperial Savings and Loan AssociationMartinsvilleVA31623River Community Bank, N.A.August 20, 2010August 24, 2012
Independent National BankOcalaFL27344CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Community National Bank at BartowBartowFL25266CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Palos Bank and Trust CompanyPalos HeightsIL17599First Midwest BankAugust 13, 2010August 22, 2012
Ravenswood BankChicagoIL34231Northbrook Bank & Trust CompanyAugust 6, 2010August 22, 2012
LibertyBankEugeneOR31964Home Federal BankJuly 30, 2010August 22, 2012
The Cowlitz BankLongviewWA22643Heritage BankJuly 30, 2010August 22, 2012
Coastal Community BankPanama City BeachFL9619Centennial BankJuly 30, 2010November 5, 2012
Bayside Savings BankPort Saint JoeFL57669Centennial BankJuly 30, 2010November 5, 2012
Northwest Bank & TrustAcworthGA57658State Bank and Trust CompanyJuly 30, 2010November 5, 2012
Home Valley BankCave JunctionOR23181South Valley Bank & TrustJuly 23, 2010September 12, 2012
SouthwestUSA BankLas VegasNV35434Plaza BankJuly 23, 2010August 22, 2012
Community Security BankNew PragueMN34486RoundbankJuly 23, 2010September 12, 2012
Thunder BankSylvan GroveKS10506The Bennington State BankJuly 23, 2010September 13, 2012
Williamsburg First National BankKingstreeSC17837First Citizens Bank and Trust Company, Inc.July 23, 2010November 5, 2012
Crescent Bank and Trust CompanyJasperGA27559Renasant BankJuly 23, 2010November 5, 2012
Sterling BankLantanaFL32536IBERIABANKJuly 23, 2010November 5, 2012
Mainstreet Savings Bank, FSBHastingsMI28136Commercial BankJuly 16, 2010September 13, 2012
Olde Cypress Community BankClewistonFL28864CenterState Bank of Florida, N.A.July 16, 2010November 5, 2012
Turnberry BankAventuraFL32280NAFH National BankJuly 16, 2010November 5, 2012
Metro Bank of Dade CountyMiamiFL25172NAFH National BankJuly 16, 2010November 5, 2012
First National Bank of the SouthSpartanburgSC35383NAFH National BankJuly 16, 2010November 5, 2012
Woodlands BankBlufftonSC32571Bank of the OzarksJuly 16, 2010November 5, 2012
Home National BankBlackwellOK11636RCB BankJuly 9, 2010December 10, 2012
USA BankPort ChesterNY58072New Century BankJuly 9, 2010September 14, 2012
Ideal Federal Savings BankBaltimoreMD32456No AcquirerJuly 9, 2010September 14, 2012
Bay National BankBaltimoreMD35462Bay Bank, FSBJuly 9, 2010January 15, 2013
High Desert State BankAlbuquerqueNM35279First American BankJune 25, 2010September 14, 2012
First National BankSavannahGA34152The Savannah Bank, N.A.June 25, 2010November 5, 2012
Peninsula BankEnglewoodFL26563Premier American Bank, N.A.June 25, 2010November 5, 2012
Nevada Security BankRenoNV57110Umpqua BankJune 18, 2010August 23, 2012
Washington First International BankSeattleWA32955East West BankJune 11, 2010September 14, 2012
TierOne BankLincolnNE29341Great Western BankJune 4, 2010September 14, 2012
Arcola Homestead Savings BankArcolaIL31813No AcquirerJune 4, 2010September 14, 2012
First National BankRosedaleMS15814The Jefferson BankJune 4, 2010November 5, 2012
Sun West BankLas VegasNV34785City National BankMay 28, 2010September 14, 2012
Granite Community Bank, NAGranite BayCA57315Tri Counties BankMay 28, 2010September 14, 2012
Bank of Florida - TampaTampaFL57814EverBankMay 28, 2010November 5, 2012
Bank of Florida - SouthwestNaplesFL35106EverBankMay 28, 2010November 5, 2012
Bank of Florida - SoutheastFort LauderdaleFL57360EverBankMay 28, 2010November 5, 2012
Pinehurst BankSaint PaulMN57735Coulee BankMay 21, 2010October 26, 2012
Midwest Bank and Trust CompanyElmwood ParkIL18117FirstMerit Bank, N.A.May 14, 2010August 23, 2012
Southwest Community BankSpringfieldMO34255Simmons First National BankMay 14, 2010August 23, 2012
New Liberty BankPlymouthMI35586Bank of Ann ArborMay 14, 2010August 23, 2012
Satilla Community BankSaint MarysGA35114Ameris BankMay 14, 2010November 5, 2012
1st Pacific Bank of CaliforniaSan DiegoCA35517City National BankMay 7, 2010December 13, 2012
Towne Bank of ArizonaMesaAZ57697Commerce Bank of ArizonaMay 7, 2010August 23, 2012
Access BankChamplinMN16476PrinsBankMay 7, 2010August 23, 2012
The Bank of BonifayBonifayFL14246First Federal Bank of FloridaMay 7, 2010November 5, 2012
Frontier BankEverettWA22710Union Bank, N.A.April 30, 2010January 15, 2013
BC National BanksButlerMO17792Community First BankApril 30, 2010August 23, 2012
Champion BankCreve CoeurMO58362BankLibertyApril 30, 2010August 23, 2012
CF BancorpPort HuronMI30005First Michigan BankApril 30, 2010January 15, 2013
Westernbank Puerto Rico
En Espanol
MayaguezPR31027Banco Popular de Puerto RicoApril 30, 2010November 5, 2012
R-G Premier Bank of Puerto Rico
En Espanol
Hato ReyPR32185Scotiabank de Puerto RicoApril 30, 2010November 5, 2012
Eurobank
En Espanol
San JuanPR27150Oriental Bank and TrustApril 30, 2010November 5, 2012
Wheatland BankNapervilleIL58429Wheaton Bank & TrustApril 23, 2010August 23, 2012
Peotone Bank and Trust CompanyPeotoneIL10888First Midwest BankApril 23, 2010August 23, 2012
Lincoln Park Savings BankChicagoIL30600Northbrook Bank & Trust CompanyApril 23, 2010August 23, 2012
New Century BankChicagoIL34821MB Financial Bank, N.A.April 23, 2010August 23, 2012
Citizens Bank and Trust Company of ChicagoChicagoIL34658Republic Bank of ChicagoApril 23, 2010August 23, 2012
Broadway BankChicagoIL22853MB Financial Bank, N.A.April 23, 2010August 23, 2012
Amcore Bank, National AssociationRockfordIL3735Harris N.A.April 23, 2010August 23, 2012
City BankLynnwoodWA21521Whidbey Island BankApril 16, 2010September 14, 2012
Tamalpais BankSan RafaelCA33493Union Bank, N.A.April 16, 2010August 23, 2012
Innovative BankOaklandCA23876Center BankApril 16, 2010August 23, 2012
Butler BankLowellMA26619People's United BankApril 16, 2010August 23, 2012
Riverside National Bank of FloridaFort PierceFL24067TD Bank, N.A.April 16, 2010November 5, 2012
AmericanFirst BankClermontFL57724TD Bank, N.A.April 16, 2010October 31, 2012
First Federal Bank of North FloridaPalatkaFL28886TD Bank, N.A.April 16, 2010January 15, 2013
Lakeside Community BankSterling HeightsMI34878No AcquirerApril 16, 2010August 23, 2012
Beach First National BankMyrtle BeachSC34242Bank of North CarolinaApril 9, 2010November 5, 2012
Desert Hills BankPhoenixAZ57060New York Community BankMarch 26, 2010August 23, 2012
Unity National BankCartersvilleGA34678Bank of the OzarksMarch 26, 2010September 14, 2012
Key West BankKey WestFL34684Centennial BankMarch 26, 2010August 23, 2012
McIntosh Commercial BankCarrolltonGA57399CharterBankMarch 26, 2010August 23, 2012
State Bank of AuroraAuroraMN8221Northern State BankMarch 19, 2010August 23, 2012
First Lowndes BankFort DepositAL24957First Citizens BankMarch 19, 2010August 23, 2012
Bank of HiawasseeHiawasseeGA10054Citizens South BankMarch 19, 2010August 23, 2012
Appalachian Community BankEllijayGA33989Community & Southern BankMarch 19, 2010October 31, 2012
Advanta Bank Corp.DraperUT33535No AcquirerMarch 19, 2010September 14, 2012
Century Security BankDuluthGA58104Bank of UpsonMarch 19, 2010August 23, 2012
American National BankParmaOH18806The National Bank and Trust CompanyMarch 19, 2010August 23, 2012
Statewide BankCovingtonLA29561Home BankMarch 12, 2010August 23, 2012
Old Southern BankOrlandoFL58182Centennial BankMarch 12, 2010August 23, 2012
The Park Avenue BankNew YorkNY27096Valley National BankMarch 12, 2010August 23, 2012
LibertyPointe BankNew YorkNY58071Valley National BankMarch 11, 2010August 23, 2012
Centennial BankOgdenUT34430No AcquirerMarch 5, 2010September 14, 2012
Waterfield BankGermantownMD34976No AcquirerMarch 5, 2010August 23, 2012
Bank of IllinoisNormalIL9268Heartland Bank and Trust CompanyMarch 5, 2010August 23, 2012
Sun American BankBoca RatonFL27126First-Citizens Bank & Trust CompanyMarch 5, 2010August 23, 2012
Rainier Pacific BankTacomaWA38129Umpqua BankFebruary 26, 2010August 23, 2012
Carson River Community BankCarson CityNV58352Heritage Bank of NevadaFebruary 26, 2010January 15, 2013
La Jolla Bank, FSBLa JollaCA32423OneWest Bank, FSBFebruary 19, 2010August 24, 2012
George Washington Savings BankOrland ParkIL29952FirstMerit Bank, N.A.February 19, 2010August 24, 2012
The La Coste National BankLa CosteTX3287Community National BankFebruary 19, 2010September 14, 2012
Marco Community BankMarco IslandFL57586Mutual of Omaha BankFebruary 19, 2010August 24, 2012
1st American State Bank of MinnesotaHancockMN15448Community Development Bank, FSBFebruary 5, 2010August 24, 2012
American Marine BankBainbridge IslandWA16730Columbia State BankJanuary 29, 2010August 24, 2012
First Regional BankLos AngelesCA23011First-Citizens Bank & Trust CompanyJanuary 29, 2010August 24, 2012
Community Bank and TrustCorneliaGA5702SCBT National AssociationJanuary 29, 2010January 15, 2013
Marshall Bank, N.A.HallockMN16133United Valley BankJanuary 29, 2010August 23, 2012
Florida Community BankImmokaleeFL5672Premier American Bank, N.A.January 29, 2010January 15, 2013
First National Bank of GeorgiaCarrolltonGA16480Community & Southern BankJanuary 29, 2010December 13, 2012
Columbia River BankThe DallesOR22469Columbia State BankJanuary 22, 2010September 14, 2012
Evergreen BankSeattleWA20501Umpqua BankJanuary 22, 2010January 15, 2013
Charter BankSanta FeNM32498Charter BankJanuary 22, 2010August 23, 2012
Bank of LeetonLeetonMO8265Sunflower Bank, N.A.January 22, 2010January 15, 2013
Premier American BankMiamiFL57147Premier American Bank, N.A.January 22, 2010December 13, 2012
Barnes Banking CompanyKaysvilleUT1252No AcquirerJanuary 15, 2010August 23, 2012
St. Stephen State BankSt. StephenMN17522First State Bank of St. JosephJanuary 15, 2010August 23, 2012
Town Community Bank & TrustAntiochIL34705First American BankJanuary 15, 2010August 23, 2012
Horizon BankBellinghamWA22977Washington Federal Savings and Loan AssociationJanuary 8, 2010August 23, 2012
First Federal Bank of California, F.S.B.Santa MonicaCA28536OneWest Bank, FSBDecember 18, 2009August 23, 2012
Imperial Capital BankLa JollaCA26348City National BankDecember 18, 2009September 5, 2012
Independent Bankers' BankSpringfieldIL26820The Independent BankersBank (TIB)December 18, 2009August 23, 2012
New South Federal Savings BankIrondaleAL32276Beal BankDecember 18, 2009August 23, 2012
Citizens State BankNew BaltimoreMI1006No AcquirerDecember 18, 2009November 5, 2012
Peoples First Community BankPanama CityFL32167Hancock BankDecember 18, 2009November 5, 2012
RockBridge Commercial BankAtlantaGA58315No AcquirerDecember 18, 2009November 5, 2012
SolutionsBankOverland ParkKS4731Arvest BankDecember 11, 2009August 23, 2012
Valley Capital Bank, N.A.MesaAZ58399Enterprise Bank & TrustDecember 11, 2009August 23, 2012
Republic Federal Bank, N.A.MiamiFL228461st United BankDecember 11, 2009November 5, 2012
Greater Atlantic BankRestonVA32583SonabankDecember 4, 2009November 5, 2012
Benchmark BankAuroraIL10440MB Financial Bank, N.A.December 4, 2009August 23, 2012
AmTrust BankClevelandOH29776New York Community BankDecember 4, 2009November 5, 2012
The Tattnall BankReidsvilleGA12080Heritage Bank of the SouthDecember 4, 2009November 5, 2012
First Security National BankNorcrossGA26290State Bank and Trust CompanyDecember 4, 2009November 5, 2012
The Buckhead Community BankAtlantaGA34663State Bank and Trust CompanyDecember 4, 2009November 5, 2012
Commerce Bank of Southwest FloridaFort MyersFL58016Central BankNovember 20, 2009November 5, 2012
Pacific Coast National BankSan ClementeCA57914Sunwest BankNovember 13, 2009August 22, 2012
Orion BankNaplesFL22427IBERIABANKNovember 13, 2009November 5, 2012
Century Bank, F.S.B.SarasotaFL32267IBERIABANKNovember 13, 2009August 22, 2012
United Commercial BankSan FranciscoCA32469East West BankNovember 6, 2009November 5, 2012
Gateway Bank of St. LouisSt. LouisMO19450Central Bank of Kansas CityNovember 6, 2009August 22, 2012
Prosperan BankOakdaleMN35074Alerus Financial, N.A.November 6, 2009August 22, 2012
Home Federal Savings BankDetroitMI30329Liberty Bank and Trust CompanyNovember 6, 2009August 22, 2012
United Security BankSpartaGA22286Ameris BankNovember 6, 2009January 15, 2013
North Houston BankHoustonTX18776U.S. Bank N.A.October 30, 2009August 22, 2012
Madisonville State BankMadisonvilleTX33782U.S. Bank N.A.October 30, 2009August 22, 2012
Citizens National BankTeagueTX25222U.S. Bank N.A.October 30, 2009August 22, 2012
Park National BankChicagoIL11677U.S. Bank N.A.October 30, 2009August 22, 2012
Pacific National BankSan FranciscoCA30006U.S. Bank N.A.October 30, 2009August 22, 2012
California National BankLos AngelesCA34659U.S. Bank N.A.October 30, 2009September 5, 2012
San Diego National BankSan DiegoCA23594U.S. Bank N.A.October 30, 2009August 22, 2012
Community Bank of LemontLemontIL35291U.S. Bank N.A.October 30, 2009January 15, 2013
Bank USA, N.A.PhoenixAZ32218U.S. Bank N.A.October 30, 2009August 22, 2012
First DuPage BankWestmontIL35038First Midwest BankOctober 23, 2009August 22, 2012
Riverview Community BankOtsegoMN57525Central BankOctober 23, 2009August 22, 2012
Bank of ElmwoodRacineWI18321Tri City National BankOctober 23, 2009August 22, 2012
Flagship National BankBradentonFL35044First Federal Bank of FloridaOctober 23, 2009August 22, 2012
Hillcrest Bank FloridaNaplesFL58336Stonegate BankOctober 23, 2009August 22, 2012
American United BankLawrencevilleGA57794Ameris BankOctober 23, 2009September 5, 2012
Partners BankNaplesFL57959Stonegate BankOctober 23, 2009January 15, 2013
San Joaquin BankBakersfieldCA23266Citizens Business BankOctober 16, 2009August 22, 2012
Southern Colorado National BankPuebloCO57263Legacy BankOctober 2, 2009September 5, 2012
Jennings State BankSpring GroveMN11416Central BankOctober 2, 2009August 21, 2012
Warren BankWarrenMI34824The Huntington National BankOctober 2, 2009August 21, 2012
Georgian BankAtlantaGA57151First Citizens Bank and Trust Company, Inc.September 25, 2009August 21, 2012
Irwin Union Bank, F.S.B.LouisvilleKY57068First Financial Bank, N.A.September 18, 2009September 5, 2012
Irwin Union Bank and Trust CompanyColumbusIN10100First Financial Bank, N.A.September 18, 2009August 21, 2012
Venture BankLaceyWA22868First-Citizens Bank & Trust CompanySeptember 11, 2009August 21, 2012
Brickwell Community BankWoodburyMN57736CorTrust Bank N.A.September 11, 2009January 15, 2013
Corus Bank, N.A.ChicagoIL13693MB Financial Bank, N.A.September 11, 2009August 21, 2012
First State BankFlagstaffAZ34875Sunwest BankSeptember 4, 2009January 15, 2013
Platinum Community BankRolling MeadowsIL35030No AcquirerSeptember 4, 2009August 21, 2012
Vantus BankSioux CityIN27732Great Southern BankSeptember 4, 2009August 21, 2012
InBankOak ForestIL20203MB Financial Bank, N.A.September 4, 2009August 21, 2012
First Bank of Kansas CityKansas CityMO25231Great American BankSeptember 4, 2009August 21, 2012
Affinity BankVenturaCA27197Pacific Western BankAugust 28, 2009August 21, 2012
Mainstreet BankForest LakeMN1909Central BankAugust 28, 2009August 21, 2012
Bradford BankBaltimoreMD28312Manufacturers and Traders Trust Company (M&T Bank)August 28, 2009January 15, 2013
Guaranty BankAustinTX32618BBVA CompassAugust 21, 2009August 21, 2012
CapitalSouth BankBirminghamAL22130IBERIABANKAugust 21, 2009January 15, 2013
First Coweta BankNewnanGA57702United BankAugust 21, 2009January 15, 2013
ebankAtlantaGA34682Stearns Bank, N.A.August 21, 2009August 21, 2012
Community Bank of NevadaLas VegasNV34043No AcquirerAugust 14, 2009August 21, 2012
Community Bank of ArizonaPhoenixAZ57645MidFirst BankAugust 14, 2009August 21, 2012
Union Bank, National AssociationGilbertAZ34485MidFirst BankAugust 14, 2009August 21, 2012
Colonial BankMontgomeryAL9609Branch Banking & Trust Company, (BB&T)August 14, 2009September 5, 2012
Dwelling House Savings and Loan AssociationPittsburghPA31559PNC Bank, N.A.August 14, 2009January 15, 2013
Community First BankPrinevilleOR23268Home Federal BankAugust 7, 2009January 15, 2013
Community National Bank of Sarasota CountyVeniceFL27183Stearns Bank, N.A.August 7, 2009August 20, 2012
First State BankSarasotaFL27364Stearns Bank, N.A.August 7, 2009August 20, 2012
Mutual BankHarveyIL18659United Central BankJuly 31, 2009August 20, 2012
First BankAmericanoElizabethNJ34270Crown BankJuly 31, 2009August 20, 2012
Peoples Community BankWest ChesterOH32288First Financial Bank, N.A.July 31, 2009August 20, 2012
Integrity BankJupiterFL57604Stonegate BankJuly 31, 2009August 20, 2012
First State Bank of AltusAltusOK9873Herring BankJuly 31, 2009August 20, 2012
Security Bank of Jones CountyGrayGA8486State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Houston CountyPerryGA27048State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Bibb CountyMaconGA27367State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North MetroWoodstockGA57105State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North FultonAlpharettaGA57430State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Gwinnett CountySuwaneeGA57346State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Waterford Village BankWilliamsvilleNY58065Evans Bank, N.A.July 24, 2009August 20, 2012
Temecula Valley BankTemeculaCA34341First-Citizens Bank & Trust CompanyJuly 17, 2009August 20, 2012
Vineyard BankRancho CucamongaCA23556California Bank & TrustJuly 17, 2009August 20, 2012
BankFirstSioux FallsSD34103Alerus Financial, N.A.July 17, 2009August 20, 2012
First Piedmont BankWinderGA34594First American Bank and Trust CompanyJuly 17, 2009January 15, 2013
Bank of WyomingThermopolisWY22754Central Bank & TrustJuly 10, 2009August 20, 2012
Founders BankWorthIL18390The PrivateBank and Trust CompanyJuly 2, 2009August 20, 2012
Millennium State Bank of TexasDallasTX57667State Bank of TexasJuly 2, 2009October 26, 2012
First National Bank of DanvilleDanvilleIL3644First Financial Bank, N.A.July 2, 2009August 20, 2012
Elizabeth State BankElizabethIL9262Galena State Bank and Trust CompanyJuly 2, 2009August 20, 2012
Rock River BankOregonIL15302The Harvard State BankJuly 2, 2009August 20, 2012
First State Bank of WinchesterWinchesterIL11710The First National Bank of BeardstownJuly 2, 2009August 20, 2012
John Warner BankClintonIL12093State Bank of LincolnJuly 2, 2009August 20, 2012
Mirae BankLos AngelesCA57332Wilshire State BankJune 26, 2009August 20, 2012
MetroPacific BankIrvineCA57893Sunwest BankJune 26, 2009August 20, 2012
Horizon BankPine CityMN9744Stearns Bank, N.A.June 26, 2009August 20, 2012
Neighborhood Community BankNewnanGA35285CharterBankJune 26, 2009August 20, 2012
Community Bank of West GeorgiaVilla RicaGA57436No AcquirerJune 26, 2009August 17, 2012
First National Bank of AnthonyAnthonyKS4614Bank of KansasJune 19, 2009August 17, 2012
Cooperative BankWilmingtonNC27837First BankJune 19, 2009August 17, 2012
Southern Community BankFayettevilleGA35251United Community BankJune 19, 2009August 17, 2012
Bank of LincolnwoodLincolnwoodIL17309Republic Bank of ChicagoJune 5, 2009August 17, 2012
Citizens National BankMacombIL5757Morton Community BankMay 22, 2009September 4, 2012
Strategic Capital BankChampaignIL35175Midland States BankMay 22, 2009September 4, 2012
BankUnited, FSBCoral GablesFL32247BankUnitedMay 21, 2009August 17, 2012
Westsound BankBremertonWA34843Kitsap BankMay 8, 2009September 4, 2012
America West BankLaytonUT35461Cache Valley BankMay 1, 2009August 17, 2012
Citizens Community BankRidgewoodNJ57563North Jersey Community BankMay 1, 2009September 4, 2012
Silverton Bank, NAAtlantaGA26535No AcquirerMay 1, 2009August 17, 2012
First Bank of IdahoKetchumID34396U.S. Bank, N.A.April 24, 2009August 17, 2012
First Bank of Beverly HillsCalabasasCA32069No AcquirerApril 24, 2009September 4, 2012
Michigan Heritage BankFarmington HillsMI34369Level One BankApril 24, 2009August 17, 2012
American Southern BankKennesawGA57943Bank of North GeorgiaApril 24, 2009August 17, 2012
Great Basin Bank of NevadaElkoNV33824Nevada State BankApril 17, 2009September 4, 2012
American Sterling BankSugar CreekMO8266Metcalf BankApril 17, 2009August 31, 2012
New Frontier BankGreeleyCO34881No AcquirerApril 10, 2009September 4, 2012
Cape Fear BankWilmingtonNC34639First Federal Savings and Loan AssociationApril 10, 2009August 17, 2012
Omni National BankAtlantaGA22238No AcquirerMarch 27, 2009August 17, 2012
TeamBank, NAPaolaKS4754Great Southern BankMarch 20, 2009August 17, 2012
Colorado National BankColorado SpringsCO18896Herring BankMarch 20, 2009August 17, 2012
FirstCity BankStockbridgeGA18243No AcquirerMarch 20, 2009August 17, 2012
Freedom Bank of GeorgiaCommerceGA57558Northeast Georgia BankMarch 6, 2009August 17, 2012
Security Savings BankHendersonNV34820Bank of NevadaFebruary 27, 2009September 7, 2012
Heritage Community BankGlenwoodIL20078MB Financial Bank, N.A.February 27, 2009August 17, 2012
Silver Falls BankSilvertonOR35399Citizens BankFebruary 20, 2009August 17, 2012
Pinnacle Bank of OregonBeavertonOR57342Washington Trust Bank of SpokaneFebruary 13, 2009August 17, 2012
Corn Belt Bank & Trust Co.PittsfieldIL16500The Carlinville National BankFebruary 13, 2009August 17, 2012
Riverside Bank of the Gulf CoastCape CoralFL34563TIB BankFebruary 13, 2009August 17, 2012
Sherman County BankLoup CityNE5431Heritage BankFebruary 13, 2009August 17, 2012
County BankMercedCA22574Westamerica BankFebruary 6, 2009September 4, 2012
Alliance BankCulver CityCA23124California Bank & TrustFebruary 6, 2009August 16, 2012
Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | Updated Date
FirstBank Financial Services | McDonough | GA | 57017 | Regions Bank | February 6, 2009 | August 16, 2012
Ocala National Bank | Ocala | FL | 26538 | CenterState Bank of Florida, N.A. | January 30, 2009 | September 4, 2012
Suburban FSB | Crofton | MD | 30763 | Bank of Essex | January 30, 2009 | August 16, 2012
MagnetBank | Salt Lake City | UT | 58001 | No Acquirer | January 30, 2009 | August 16, 2012
1st Centennial Bank | Redlands | CA | 33025 | First California Bank | January 23, 2009 | August 16, 2012
Bank of Clark County | Vancouver | WA | 34959 | Umpqua Bank | January 16, 2009 | August 16, 2012
National Bank of Commerce | Berkeley | IL | 19733 | Republic Bank of Chicago | January 16, 2009 | August 16, 2012
Sanderson State Bank | Sanderson | TX | 11568 | The Pecos County State Bank | December 12, 2008 | September 4, 2012
Haven Trust Bank | Duluth | GA | 35379 | Branch Banking & Trust Company, (BB&T) | December 12, 2008 | August 16, 2012
First Georgia Community Bank | Jackson | GA | 34301 | United Bank | December 5, 2008 | August 16, 2012
PFF Bank & Trust | Pomona | CA | 28344 | U.S. Bank, N.A. | November 21, 2008 | January 4, 2013
Downey Savings & Loan | Newport Beach | CA | 30968 | U.S. Bank, N.A. | November 21, 2008 | January 4, 2013
Community Bank | Loganville | GA | 16490 | Bank of Essex | November 21, 2008 | September 4, 2012
Security Pacific Bank | Los Angeles | CA | 23595 | Pacific Western Bank | November 7, 2008 | August 28, 2012
Franklin Bank, SSB | Houston | TX | 26870 | Prosperity Bank | November 7, 2008 | August 16, 2012
Freedom Bank | Bradenton | FL | 57930 | Fifth Third Bank | October 31, 2008 | August 16, 2012
Alpha Bank & Trust | Alpharetta | GA | 58241 | Stearns Bank, N.A. | October 24, 2008 | August 16, 2012
Meridian Bank | Eldred | IL | 13789 | National Bank | October 10, 2008 | May 31, 2012
Main Street Bank | Northville | MI | 57654 | Monroe Bank & Trust | October 10, 2008 | August 16, 2012
Washington Mutual Bank (Including its subsidiary Washington Mutual Bank FSB) | Henderson | NV | 32633 | JP Morgan Chase Bank | September 25, 2008 | August 16, 2012
Ameribank | Northfork | WV | 6782 | The Citizens Savings Bank / Pioneer Community Bank, Inc. | September 19, 2008 | August 16, 2012
Silver State Bank | Henderson | NV | 34194 | Nevada State Bank | September 5, 2008 | August 16, 2012
Integrity Bank | Alpharetta | GA | 35469 | Regions Bank | August 29, 2008 | August 16, 2012
Columbian Bank & Trust | Topeka | KS | 22728 | Citizens Bank & Trust | August 22, 2008 | August 16, 2012
First Priority Bank | Bradenton | FL | 57523 | SunTrust Bank | August 1, 2008 | August 16, 2012
First Heritage Bank, NA | Newport Beach | CA | 57961 | Mutual of Omaha Bank | July 25, 2008 | August 28, 2012
First National Bank of Nevada | Reno | NV | 27011 | Mutual of Omaha Bank | July 25, 2008 | August 28, 2012
IndyMac Bank | Pasadena | CA | 29730 | OneWest Bank, FSB | July 11, 2008 | August 28, 2012
First Integrity Bank, NA | Staples | MN | 12736 | First International Bank and Trust | May 30, 2008 | August 28, 2012
ANB Financial, NA | Bentonville | AR | 33901 | Pulaski Bank and Trust Company | May 9, 2008 | August 28, 2012
Hume Bank | Hume | MO | 1971 | Security Bank | March 7, 2008 | August 28, 2012
Douglass National Bank | Kansas City | MO | 24660 | Liberty Bank and Trust Company | January 25, 2008 | October 26, 2012
Miami Valley Bank | Lakeview | OH | 16848 | The Citizens Banking Company | October 4, 2007 | August 28, 2012
NetBank | Alpharetta | GA | 32575 | ING DIRECT | September 28, 2007 | August 28, 2012
Metropolitan Savings Bank | Pittsburgh | PA | 35353 | Allegheny Valley Bank of Pittsburgh | February 2, 2007 | October 27, 2010
Bank of Ephraim | Ephraim | UT | 1249 | Far West Bank | June 25, 2004 | April 9, 2008
Reliance Bank | White Plains | NY | 26778 | Union State Bank | March 19, 2004 | April 9, 2008
Guaranty National Bank of Tallahassee | Tallahassee | FL | 26838 | Hancock Bank of Florida | March 12, 2004 | June 5, 2012
Dollar Savings Bank | Newark | NJ | 31330 | No Acquirer | February 14, 2004 | April 9, 2008
Pulaski Savings Bank | Philadelphia | PA | 27203 | Earthstar Bank | November 14, 2003 | July 22, 2005
First National Bank of Blanchardville | Blanchardville | WI | 11639 | The Park Bank | May 9, 2003 | June 5, 2012
Southern Pacific Bank | Torrance | CA | 27094 | Beal Bank | February 7, 2003 | October 20, 2008
Farmers Bank of Cheneyville | Cheneyville | LA | 16445 | Sabine State Bank & Trust | December 17, 2002 | October 20, 2004
Bank of Alamo | Alamo | TN | 9961 | No Acquirer | November 8, 2002 | March 18, 2005
AmTrade International Bank | Atlanta | GA | 33784 | No Acquirer | September 30, 2002 | September 11, 2006
Universal Federal Savings Bank | Chicago | IL | 29355 | Chicago Community Bank | June 27, 2002 | April 9, 2008
Connecticut Bank of Commerce | Stamford | CT | 19183 | Hudson United Bank | June 26, 2002 | February 14, 2012
New Century Bank | Shelby Township | MI | 34979 | No Acquirer | March 28, 2002 | March 18, 2005
Net 1st National Bank | Boca Raton | FL | 26652 | Bank Leumi USA | March 1, 2002 | April 9, 2008
NextBank, NA | Phoenix | AZ | 22314 | No Acquirer | February 7, 2002 | August 27, 2010
Oakwood Deposit Bank Co. | Oakwood | OH | 8966 | The State Bank & Trust Company | February 1, 2002 | October 25, 2012
Bank of Sierra Blanca | Sierra Blanca | TX | 22002 | The Security State Bank of Pecos | January 18, 2002 | November 6, 2003
Hamilton Bank, NA | Miami | FL | 24382 | Israel Discount Bank of New York | January 11, 2002 | June 5, 2012
Sinclair National Bank | Gravette | AR | 34248 | Delta Trust & Bank | September 7, 2001 | February 10, 2004
Superior Bank, FSB | Hinsdale | IL | 32646 | Superior Federal, FSB | July 27, 2001 | June 5, 2012
Malta National Bank | Malta | OH | 6629 | North Valley Bank | May 3, 2001 | November 18, 2002
First Alliance Bank & Trust Co. | Manchester | NH | 34264 | Southern New Hampshire Bank & Trust | February 2, 2001 | February 18, 2003
National State Bank of Metropolis | Metropolis | IL | 3815 | Banterra Bank of Marion | December 14, 2000 | March 17, 2005
Bank of Honolulu | Honolulu | HI | 21029 | Bank of the Orient | October 13, 2000 | March 17, 2005
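This listing is the tail of the FDIC failed bank list, the dataset the pandas IO documentation uses to demonstrate HTML table parsing. As a minimal sketch (assuming the table is saved locally as ``banklist.html`` — a hypothetical filename here), it could be loaded with ``pandas.read_html``:

.. code-block:: python

   import pandas as pd

   # read_html returns a list of DataFrames, one per <table> found in the document.
   # "banklist.html" is an assumed local path for the table shown above.
   tables = pd.read_html("banklist.html")
   banks = tables[0]

   # Quick sanity checks on the parsed table.
   print(banks.shape)
   print(banks.head())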
- - - - - - - - - - - - - - - - - - diff --git a/doc/source/_static/ci.png b/doc/source/_static/ci.png index 3a4225e3ce1eb..4754dc2945db5 100644 Binary files a/doc/source/_static/ci.png and b/doc/source/_static/ci.png differ diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index bb24761cdb159..e4c5964259349 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -10,6 +10,14 @@ font-size: 0.9rem; } +.gs-data-header { + background-color: var(--pst-color-on-surface); +} + +.gs-data-list { + background-color: var(--pst-color-on-background); +} + .gs-data-title .badge { margin: 10px; padding: 5px; @@ -57,45 +65,33 @@ margin-top: -5px; } .gs-callout-remember { - border-left-color: #f0ad4e; + border-left-color: var(--pst-color-secondary); align-items: center; font-size: 1.2rem; } .gs-callout-remember h4 { - color: #f0ad4e; + color: var(--pst-color-secondary); } /* reference to user guide */ .gs-torefguide { align-items: center; font-size: 0.9rem; + background-color: var(--pst-color-on-background); + border-radius: .25rem; + padding: 2px; } .gs-torefguide .badge { - background-color: #130654; - margin: 10px 10px 10px 0px; + background-color: var(--pst-color-primary); + margin: 10px 10px 10px 10px; padding: 5px; } -.gs-torefguide a { - margin-left: 5px; - color: #130654; - border-bottom: 1px solid #FFCA00f3; - box-shadow: 0px -10px 0px #FFCA00f3 inset; -} - .gs-torefguide p { margin-top: 1rem; } -.gs-torefguide a:hover { - margin-left: 5px; - color: grey; - text-decoration: none; - border-bottom: 1px solid #b2ff80f3; - box-shadow: 0px -10px 0px #b2ff80f3 inset; -} - /* question-task environment */ ul.task-bullet, ol.custom-bullet{ @@ -113,14 +109,14 @@ ul.task-bullet > li:before { margin-left:-2em; background-position:center; background-repeat:no-repeat; - background-color: #130654; + background-color: var(--pst-color-primary); border-radius: 50%; background-size:100%; background-image:url('https://github.com/question_mark_noback.svg'); } ul.task-bullet > li { - border-left: 1px solid #130654; + border-left: 1px solid var(--pst-color-primary); padding-left:1em; } @@ -131,20 +127,31 @@ ul.task-bullet > li > p:first-child { /* Getting started index page */ -.intro-card { - background:#FFF; +.comparison-card { + background-color: var(--pst-color-background); border-radius:0; padding: 30px 10px 10px 10px; margin: 10px 0px; } -.intro-card .card-text { - margin:20px 0px; - /*min-height: 150px; */ +.comparison-card p.card-text { + margin: 0px; } -.intro-card .card-img-top { +.comparison-card .card-img-top { margin: 10px; + margin-bottom: 20px; + height: 72px; + background: none !important; +} + +.comparison-card-excel .card-img-top, .comparison-card-stata .card-img-top, .comparison-card-sas .card-img-top { + height: 52px; +} + +.comparison-card .card-footer { + border: none; + background-color: var(--pst-color-background); } .install-block { @@ -153,16 +160,18 @@ ul.task-bullet > li > p:first-child { .install-card .card-header { border: none; - background-color:white; - color: #150458; + background-color: transparent; + padding: 1rem 1rem 0rem 1rem; +} + +.install-card .card-header p.card-text { font-size: 1.1rem; font-weight: bold; - padding: 1rem 1rem 0rem 1rem; } .install-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-card pre { @@ -224,15 +233,16 @@ ul.task-bullet > li > p:first-child { .tutorial-card .card-header { cursor: pointer; 
- background-color: white; + background-color: var(--pst-color-surface); + border: 1px solid var(--pst-color-border) } .tutorial-card .card-body { - background-color: #F0F0F0; + background-color: var(--pst-color-on-background); } .tutorial-card .badge { - background-color: #130654; + background-color: var(--pst-color-primary); margin: 10px 10px 10px 10px; padding: 5px; } @@ -241,8 +251,9 @@ ul.task-bullet > li > p:first-child { margin: 0px; } + .tutorial-card .gs-badge-link a { - color: white; + color: var(--pst-color-text-base); text-decoration: none; } diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 43cd631890330..c32a9c8f40ff5 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -1,36 +1,53 @@ -/* Getting started index page */ +/* Override some aspects of the pydata-sphinx-theme */ + +:root { + /* Use softer blue from bootstrap's default info color */ + --pst-color-info: 23, 162, 184; +} + +table { + width: auto; /* Override fit-content which breaks Styler user guide ipynb */ +} + +/* Main index page overview cards */ .intro-card { background: #fff; border-radius: 0; - padding: 30px 10px 10px 10px; + padding: 30px 10px 20px 10px; margin: 10px 0px; } -.intro-card .card-text { - margin: 20px 0px; - /*min-height: 150px; */ +.intro-card p.card-text { + margin: 0px; +} + +.intro-card .card-img-top { + margin: 10px; + height: 52px; + background: none !important; } -.custom-button { - background-color: #dcdcdc; +.intro-card .card-header { border: none; - color: #484848; - text-align: center; - text-decoration: none; - display: inline-block; - font-size: 0.9rem; - border-radius: 0.5rem; - max-width: 220px; - padding: 0.5rem 0rem; + background-color: transparent; + color: #150458 !important; + font-size: var(--pst-font-size-h5); + font-weight: bold; + padding: 2.5rem 0rem 0.5rem 0rem; +} + +.intro-card .card-footer { + border: none; + background-color: transparent; } -.custom-button a { - color: #484848; +.intro-card .card-footer p.card-text{ + max-width: 220px; + margin-left: auto; + margin-right: auto; } -.custom-button p { - margin-top: 0; - margin-bottom: 0rem; - color: #484848; +.card, .card img { + background-color: var(--pst-color-background); } diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png deleted file mode 100644 index d86018363ffdc..0000000000000 Binary files a/doc/source/_static/eval-perf-small.png and /dev/null differ diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png index 14c69c1b85d9e..ed92337c1d995 100644 Binary files a/doc/source/_static/eval-perf.png and b/doc/source/_static/eval-perf.png differ diff --git a/doc/source/_static/index_api.svg b/doc/source/_static/index_api.svg index 70bf0d3504b1a..69f7ba1d2d114 100644 --- a/doc/source/_static/index_api.svg +++ b/doc/source/_static/index_api.svg @@ -64,29 +64,29 @@ inkscape:connector-curvature="0" id="path899" d="M 324.96812,187.09499 H 303.0455 v 72.1639 h 22.67969" - style="fill:none;stroke:#150458;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + 
style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> diff --git a/doc/source/_static/index_getting_started.svg b/doc/source/_static/index_getting_started.svg index d00e462427193..2d36622cb7e55 100644 --- a/doc/source/_static/index_getting_started.svg +++ b/doc/source/_static/index_getting_started.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(2.9219487,-8.5995374)"> diff --git a/doc/source/_static/index_user_guide.svg b/doc/source/_static/index_user_guide.svg index a567103af5918..bd170535170a3 100644 --- a/doc/source/_static/index_user_guide.svg +++ b/doc/source/_static/index_user_guide.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(141.8903,-20.32143)"> + + + + + + + + \ No newline at end of file diff --git a/doc/source/_static/logo_sql.svg b/doc/source/_static/logo_sql.svg index 4a5b7d0b1b943..38b3b2c726214 100644 --- a/doc/source/_static/logo_sql.svg +++ b/doc/source/_static/logo_sql.svg @@ -58,10 +58,10 @@ d="m 18.846017,1.608 c -0.497,-0.326 -1.193,-0.615 -2.069,-0.858 -1.742,-0.484 -4.05,-0.75 -6.498,-0.75 -2.4480004,0 -4.7560004,0.267 -6.4980004,0.75 -0.877,0.243 -1.573,0.532 -2.069,0.858 -0.619,0.407 -0.93299996,0.874 -0.93299996,1.391 v 12 c 0,0.517 0.31399996,0.985 0.93299996,1.391 0.497,0.326 1.193,0.615 2.069,0.858 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.877,-0.243 1.573,-0.532 2.069,-0.858 0.619,-0.406 0.933,-0.874 0.933,-1.391 v -12 c 0,-0.517 -0.314,-0.985 -0.933,-1.391 z M 4.0490166,1.713 c 1.658,-0.46 3.87,-0.714 6.2300004,-0.714 2.36,0 4.573,0.254 6.23,0.714 1.795,0.499 2.27,1.059 2.27,1.286 0,0.227 -0.474,0.787 -2.27,1.286 -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 0,-0.227 0.474,-0.787 2.27,-1.286 z M 16.509017,16.285 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 v -2.566 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 8.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 4.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z" id="path2" inkscape:connector-curvature="0" - style="fill:#000000" /> + style="fill:#888888" /> + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/source/_static/spreadsheets/pivot.png b/doc/source/_static/spreadsheets/pivot.png new file mode 100644 index 0000000000000..beacc90bc313e Binary files /dev/null and b/doc/source/_static/spreadsheets/pivot.png differ diff --git a/doc/source/_static/spreadsheets/sort.png 
b/doc/source/_static/spreadsheets/sort.png new file mode 100644 index 0000000000000..253f2f3bfb9ba Binary files /dev/null and b/doc/source/_static/spreadsheets/sort.png differ diff --git a/doc/source/_static/spreadsheets/vlookup.png b/doc/source/_static/spreadsheets/vlookup.png new file mode 100644 index 0000000000000..e96da01da1eeb Binary files /dev/null and b/doc/source/_static/spreadsheets/vlookup.png differ diff --git a/doc/source/_static/style/appmaphead1.png b/doc/source/_static/style/appmaphead1.png new file mode 100644 index 0000000000000..905bcaa63e900 Binary files /dev/null and b/doc/source/_static/style/appmaphead1.png differ diff --git a/doc/source/_static/style/appmaphead2.png b/doc/source/_static/style/appmaphead2.png new file mode 100644 index 0000000000000..9adde61908378 Binary files /dev/null and b/doc/source/_static/style/appmaphead2.png differ diff --git a/doc/source/_static/style/bg_ax0.png b/doc/source/_static/style/bg_ax0.png new file mode 100644 index 0000000000000..1767d34136a02 Binary files /dev/null and b/doc/source/_static/style/bg_ax0.png differ diff --git a/doc/source/_static/style/bg_axNone.png b/doc/source/_static/style/bg_axNone.png new file mode 100644 index 0000000000000..8882c6f689773 Binary files /dev/null and b/doc/source/_static/style/bg_axNone.png differ diff --git a/doc/source/_static/style/bg_axNone_gmap.png b/doc/source/_static/style/bg_axNone_gmap.png new file mode 100644 index 0000000000000..bdd2b55e8c6b4 Binary files /dev/null and b/doc/source/_static/style/bg_axNone_gmap.png differ diff --git a/doc/source/_static/style/bg_axNone_lowhigh.png b/doc/source/_static/style/bg_axNone_lowhigh.png new file mode 100644 index 0000000000000..c37a707e73692 Binary files /dev/null and b/doc/source/_static/style/bg_axNone_lowhigh.png differ diff --git a/doc/source/_static/style/bg_axNone_vminvmax.png b/doc/source/_static/style/bg_axNone_vminvmax.png new file mode 100644 index 0000000000000..4ca958de15ec3 Binary files /dev/null and b/doc/source/_static/style/bg_axNone_vminvmax.png differ diff --git a/doc/source/_static/style/bg_gmap.png b/doc/source/_static/style/bg_gmap.png new file mode 100644 index 0000000000000..039ff6b78958e Binary files /dev/null and b/doc/source/_static/style/bg_gmap.png differ diff --git a/doc/source/_static/style/df_pipe.png b/doc/source/_static/style/df_pipe.png new file mode 100644 index 0000000000000..071a481ad5acc Binary files /dev/null and b/doc/source/_static/style/df_pipe.png differ diff --git a/doc/source/_static/style/df_pipe_applydata.png b/doc/source/_static/style/df_pipe_applydata.png new file mode 100644 index 0000000000000..a2d5aa514e311 Binary files /dev/null and b/doc/source/_static/style/df_pipe_applydata.png differ diff --git a/doc/source/_static/style/df_pipe_applymap.png b/doc/source/_static/style/df_pipe_applymap.png new file mode 100644 index 0000000000000..cd493c78452ef Binary files /dev/null and b/doc/source/_static/style/df_pipe_applymap.png differ diff --git a/doc/source/_static/style/df_pipe_hl.png b/doc/source/_static/style/df_pipe_hl.png new file mode 100644 index 0000000000000..2238a55ab1ce3 Binary files /dev/null and b/doc/source/_static/style/df_pipe_hl.png differ diff --git a/doc/source/_static/style/df_pipe_hl2.png b/doc/source/_static/style/df_pipe_hl2.png new file mode 100644 index 0000000000000..7025a7c373d92 Binary files /dev/null and b/doc/source/_static/style/df_pipe_hl2.png differ diff --git a/doc/source/_static/style/footer_extended.png b/doc/source/_static/style/footer_extended.png new file 
mode 100644 index 0000000000000..3699d61ad4346 Binary files /dev/null and b/doc/source/_static/style/footer_extended.png differ diff --git a/doc/source/_static/style/footer_simple.png b/doc/source/_static/style/footer_simple.png new file mode 100644 index 0000000000000..56dc3c09cc700 Binary files /dev/null and b/doc/source/_static/style/footer_simple.png differ diff --git a/doc/source/_static/style/format_excel_css.png b/doc/source/_static/style/format_excel_css.png new file mode 100644 index 0000000000000..0bd4662c3f2d0 Binary files /dev/null and b/doc/source/_static/style/format_excel_css.png differ diff --git a/doc/source/_static/style/hbetw_axNone.png b/doc/source/_static/style/hbetw_axNone.png new file mode 100644 index 0000000000000..2918131b40bde Binary files /dev/null and b/doc/source/_static/style/hbetw_axNone.png differ diff --git a/doc/source/_static/style/hbetw_basic.png b/doc/source/_static/style/hbetw_basic.png new file mode 100644 index 0000000000000..1d8e015aec37f Binary files /dev/null and b/doc/source/_static/style/hbetw_basic.png differ diff --git a/doc/source/_static/style/hbetw_props.png b/doc/source/_static/style/hbetw_props.png new file mode 100644 index 0000000000000..56bbe8479d564 Binary files /dev/null and b/doc/source/_static/style/hbetw_props.png differ diff --git a/doc/source/_static/style/hbetw_seq.png b/doc/source/_static/style/hbetw_seq.png new file mode 100644 index 0000000000000..0fc3108a7968c Binary files /dev/null and b/doc/source/_static/style/hbetw_seq.png differ diff --git a/doc/source/_static/style/hq_ax1.png b/doc/source/_static/style/hq_ax1.png new file mode 100644 index 0000000000000..95d840b7c8f99 Binary files /dev/null and b/doc/source/_static/style/hq_ax1.png differ diff --git a/doc/source/_static/style/hq_axNone.png b/doc/source/_static/style/hq_axNone.png new file mode 100644 index 0000000000000..40a33b194e640 Binary files /dev/null and b/doc/source/_static/style/hq_axNone.png differ diff --git a/doc/source/_static/style/hq_props.png b/doc/source/_static/style/hq_props.png new file mode 100644 index 0000000000000..1f11749096690 Binary files /dev/null and b/doc/source/_static/style/hq_props.png differ diff --git a/doc/source/_static/style/latex_1.png b/doc/source/_static/style/latex_1.png new file mode 100644 index 0000000000000..8b901878a0ec9 Binary files /dev/null and b/doc/source/_static/style/latex_1.png differ diff --git a/doc/source/_static/style/latex_2.png b/doc/source/_static/style/latex_2.png new file mode 100644 index 0000000000000..7d6baa681575e Binary files /dev/null and b/doc/source/_static/style/latex_2.png differ diff --git a/doc/source/_static/style/latex_stocks.png b/doc/source/_static/style/latex_stocks.png new file mode 100644 index 0000000000000..c8906c33b810b Binary files /dev/null and b/doc/source/_static/style/latex_stocks.png differ diff --git a/doc/source/_static/style/latex_stocks_html.png b/doc/source/_static/style/latex_stocks_html.png new file mode 100644 index 0000000000000..11b30faddf47c Binary files /dev/null and b/doc/source/_static/style/latex_stocks_html.png differ diff --git a/doc/source/_static/style/tg_ax0.png b/doc/source/_static/style/tg_ax0.png new file mode 100644 index 0000000000000..3460329352282 Binary files /dev/null and b/doc/source/_static/style/tg_ax0.png differ diff --git a/doc/source/_static/style/tg_axNone.png b/doc/source/_static/style/tg_axNone.png new file mode 100644 index 0000000000000..00357f7eb016b Binary files /dev/null and b/doc/source/_static/style/tg_axNone.png differ diff --git 
a/doc/source/_static/style/tg_axNone_gmap.png b/doc/source/_static/style/tg_axNone_gmap.png new file mode 100644 index 0000000000000..d06a4b244a23d Binary files /dev/null and b/doc/source/_static/style/tg_axNone_gmap.png differ diff --git a/doc/source/_static/style/tg_axNone_lowhigh.png b/doc/source/_static/style/tg_axNone_lowhigh.png new file mode 100644 index 0000000000000..bc3fb16ee8e40 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_lowhigh.png differ diff --git a/doc/source/_static/style/tg_axNone_vminvmax.png b/doc/source/_static/style/tg_axNone_vminvmax.png new file mode 100644 index 0000000000000..42579c2840fb9 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_vminvmax.png differ diff --git a/doc/source/_static/style/tg_gmap.png b/doc/source/_static/style/tg_gmap.png new file mode 100644 index 0000000000000..fb73529544180 Binary files /dev/null and b/doc/source/_static/style/tg_gmap.png differ diff --git a/doc/source/conf.py b/doc/source/conf.py index ee0d4ca3f2a24..6671cefae9073 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -9,13 +9,13 @@ # # All configuration values have a default; values that are commented out # serve to show the default. - from datetime import datetime import importlib import inspect import logging import os import sys +import warnings import jinja2 from numpydoc.docscrape import NumpyDocString @@ -50,35 +50,43 @@ # sphinxext. extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.doctest", - "sphinx.ext.extlinks", - "sphinx.ext.todo", - "numpydoc", # handle NumPy documentation formatted docstrings + "contributors", # custom pandas extension "IPython.sphinxext.ipython_directive", "IPython.sphinxext.ipython_console_highlighting", "matplotlib.sphinxext.plot_directive", - "sphinx.ext.intersphinx", + "numpydoc", + "sphinx_copybutton", + "sphinx_panels", + "sphinx_toggleprompt", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", "sphinx.ext.coverage", - "sphinx.ext.mathjax", + "sphinx.ext.doctest", + "sphinx.ext.extlinks", "sphinx.ext.ifconfig", + "sphinx.ext.intersphinx", "sphinx.ext.linkcode", + "sphinx.ext.mathjax", + "sphinx.ext.todo", "nbsphinx", - "contributors", # custom pandas extension ] -exclude_patterns = ["**.ipynb_checkpoints"] +exclude_patterns = [ + "**.ipynb_checkpoints", + # to ensure that include files (partial pages) aren't built, exclude them + # https://github.com/sphinx-doc/sphinx/issues/1965#issuecomment-124732907 + "**/includes/**", +] try: import nbconvert except ImportError: - logger.warn("nbconvert not installed. Skipping notebooks.") + logger.warning("nbconvert not installed. Skipping notebooks.") exclude_patterns.append("**/*.ipynb") else: try: nbconvert.utils.pandoc.get_pandoc_version() except nbconvert.utils.pandoc.PandocMissing: - logger.warn("Pandoc not installed. Skipping notebooks.") + logger.warning("Pandoc not installed. Skipping notebooks.") exclude_patterns.append("**/*.ipynb") # sphinx_pattern can be '-api' to exclude the API pages, @@ -86,17 +94,26 @@ # (e.g. 
'10min.rst' or 'pandas.DataFrame.head') source_path = os.path.dirname(os.path.abspath(__file__)) pattern = os.environ.get("SPHINX_PATTERN") +single_doc = pattern is not None and pattern not in ("-api", "whatsnew") +include_api = pattern is None or pattern == "whatsnew" if pattern: for dirname, dirs, fnames in os.walk(source_path): + reldir = os.path.relpath(dirname, source_path) for fname in fnames: if os.path.splitext(fname)[-1] in (".rst", ".ipynb"): fname = os.path.relpath(os.path.join(dirname, fname), source_path) if fname == "index.rst" and os.path.abspath(dirname) == source_path: continue - elif pattern == "-api" and dirname == "reference": + if pattern == "-api" and reldir.startswith("reference"): + exclude_patterns.append(fname) + elif ( + pattern == "whatsnew" + and not reldir.startswith("reference") + and reldir != "whatsnew" + ): exclude_patterns.append(fname) - elif pattern != "-api" and fname != pattern: + elif single_doc and fname != pattern: exclude_patterns.append(fname) with open(os.path.join(source_path, "index.rst.template")) as f: @@ -104,11 +121,11 @@ with open(os.path.join(source_path, "index.rst"), "w") as f: f.write( t.render( - include_api=pattern is None, - single_doc=(pattern if pattern is not None and pattern != "-api" else None), + include_api=include_api, + single_doc=(pattern if single_doc else None), ) ) -autosummary_generate = True if pattern is None else ["index"] +autosummary_generate = True if include_api else ["index"] autodoc_typehints = "none" # numpydoc @@ -125,6 +142,13 @@ # nbsphinx do not use requirejs (breaks bootstrap) nbsphinx_requirejs_path = "" +# sphinx-panels shouldn't add bootstrap css since the pydata-sphinx-theme +# already loads it +panels_add_bootstrap_css = False + +# https://sphinx-toggleprompt.readthedocs.io/en/stable/#offset +toggleprompt_offset_right = 35 + # Add any paths that contain templates here, relative to this directory. templates_path = ["../_templates"] @@ -139,14 +163,15 @@ # General information about the project. project = "pandas" -copyright = f"2008-{datetime.now().year}, the pandas development team" +# We have our custom "pandas_footer.html" template, using copyright for the current year +copyright = f"{datetime.now().year}" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -import pandas # noqa: E402 isort:skip +import pandas # isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -156,7 +181,7 @@ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -# language = None +language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -206,11 +231,25 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
+ +switcher_version = version +if ".dev" in version: + switcher_version = "dev" +elif "rc" in version: + switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)" + html_theme_options = { "external_links": [], + "footer_items": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", "google_analytics_id": "UA-27880019-2", + "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, + "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], + "switcher": { + "json_url": "/versions.json", + "version_match": switcher_version, + }, } # Add any paths that contain custom themes here, relative to this directory. @@ -308,9 +347,9 @@ for method in methods: # ... and each of its public methods - moved_api_pages.append((f"{old}.{method}", f"{new}.{method}",)) + moved_api_pages.append((f"{old}.{method}", f"{new}.{method}")) -if pattern is None: +if include_api: html_additional_pages = { "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages } @@ -335,7 +374,7 @@ html_context = { - "redirects": {old: new for old, new in moved_api_pages}, + "redirects": dict(moved_api_pages), "header": header, } @@ -406,28 +445,26 @@ # latex_use_modindex = True -if pattern is None: +if include_api: intersphinx_mapping = { "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), - "matplotlib": ("https://matplotlib.org/", None), + "matplotlib": ("https://matplotlib.org/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), "py": ("https://pylib.readthedocs.io/en/latest/", None), "python": ("https://docs.python.org/3/", None), - "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), - "statsmodels": ("https://www.statsmodels.org/devel/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/", None), "pyarrow": ("https://arrow.apache.org/docs/", None), } # extlinks alias extlinks = { "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"), - "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "), } ipython_warning_is_error = False -ipython_exec_lines = [ +ipython_execlines = [ "import numpy as np", "import pandas as pd", # This ensures correct rendering on system with console encoding != utf8 @@ -441,14 +478,13 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx # noqa: E402 isort:skip -from sphinx.util import rpartition # noqa: E402 isort:skip -from sphinx.ext.autodoc import ( # noqa: E402 isort:skip +import sphinx # isort:skip +from sphinx.ext.autodoc import ( # isort:skip AttributeDocumenter, Documenter, MethodDocumenter, ) -from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip +from sphinx.ext.autosummary import Autosummary # isort:skip class AccessorDocumenter(MethodDocumenter): @@ -502,8 +538,8 @@ def resolve_name(self, modname, parents, path, base): # HACK: this is added in comparison to ClassLevelDocumenter # mod_cls still exists of class.accessor, so an extra # rpartition is needed - modname, accessor = rpartition(mod_cls, ".") - modname, cls = rpartition(modname, ".") + modname, _, accessor = mod_cls.rpartition(".") + modname, _, cls = modname.rpartition(".") parents = [cls, accessor] # if the 
module name is still missing, get it like above if not modname: @@ -547,7 +583,14 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter): priority = 0.5 def format_name(self): - return MethodDocumenter.format_name(self).rstrip(".__call__") + if sys.version_info < (3, 9): + # NOTE pyupgrade will remove this when we run it with --py39-plus + # so don't remove the unnecessary `else` statement below + from pandas.util._str_methods import removesuffix + + return removesuffix(MethodDocumenter.format_name(self), ".__call__") + else: + return MethodDocumenter.format_name(self).removesuffix(".__call__") class PandasAutosummary(Autosummary): @@ -609,19 +652,30 @@ def linkcode_resolve(domain, info): obj = submod for part in fullname.split("."): try: - obj = getattr(obj, part) + with warnings.catch_warnings(): + # Accessing deprecated objects will generate noisy warnings + warnings.simplefilter("ignore", FutureWarning) + obj = getattr(obj, part) except AttributeError: return None try: fn = inspect.getsourcefile(inspect.unwrap(obj)) except TypeError: - fn = None + try: # property + fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) + except (AttributeError, TypeError): + fn = None if not fn: return None try: source, lineno = inspect.getsourcelines(obj) + except TypeError: + try: # property + source, lineno = inspect.getsourcelines(obj.fget) + except (AttributeError, TypeError): + lineno = None except OSError: lineno = None @@ -633,7 +687,7 @@ def linkcode_resolve(domain, info): fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__)) if "+" in pandas.__version__: - return f"https://github.com/pandas-dev/pandas/blob/master/pandas/%7Bfn%7D%7Blinespec%7D" + return f"https://github.com/pandas-dev/pandas/blob/main/pandas/%7Bfn%7D%7Blinespec%7D" else: return ( f"https://github.com/pandas-dev/pandas/blob/" @@ -687,6 +741,30 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = joined.split("\n") +_BUSINED_ALIASES = [ + "pandas.tseries.offsets." + name + for name in [ + "BDay", + "CDay", + "BMonthEnd", + "BMonthBegin", + "CBMonthEnd", + "CBMonthBegin", + ] +] + + +def process_business_alias_docstrings(app, what, name, obj, options, lines): + """ + Starting with sphinx 3.4, the "autodoc-process-docstring" event also + gets called for alias classes. This results in numpydoc adding the + methods/attributes to the docstring, which we don't want (+ this + causes warnings with sphinx). + """ + if name in _BUSINED_ALIASES: + lines[:] = [] + + suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just @@ -716,6 +794,7 @@ def setup(app): app.connect("source-read", rstjinja) app.connect("autodoc-process-docstring", remove_flags_docstring) app.connect("autodoc-process-docstring", process_class_docstrings) + app.connect("autodoc-process-docstring", process_business_alias_docstrings) app.add_autodocumenter(AccessorDocumenter) app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst deleted file mode 100644 index 11d0c35f92ff5..0000000000000 --- a/doc/source/development/code_style.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. _code_style: - -{{ header }} - -======================= -pandas code style guide -======================= - -.. 
contents:: Table of contents: - :local: - -*pandas* follows the `PEP8 `_ -standard and uses `Black `_ -and `Flake8 `_ to ensure a -consistent code format throughout the project. For details see the -:ref:`contributing guide to pandas`. - -Patterns -======== - -Using foo.__class__ -------------------- - - -pandas uses 'type(foo)' instead 'foo.__class__' as it is making the code more -readable. -For example: - -**Good:** - -.. code-block:: python - - foo = "bar" - type(foo) - -**Bad:** - -.. code-block:: python - - foo = "bar" - foo.__class__ - - -String formatting -================= - -Concatenated strings --------------------- - -Using f-strings -~~~~~~~~~~~~~~~ - -pandas uses f-strings formatting instead of '%' and '.format()' string formatters. - -The convention of using f-strings on a string that is concatenated over several lines, -is to prefix only the lines containing values which need to be interpreted. - -For example: - -**Good:** - -.. code-block:: python - - foo = "old_function" - bar = "new_function" - - my_warning_message = ( - f"Warning, {foo} is deprecated, " - "please use the new and way better " - f"{bar}" - ) - -**Bad:** - -.. code-block:: python - - foo = "old_function" - bar = "new_function" - - my_warning_message = ( - f"Warning, {foo} is deprecated, " - f"please use the new and way better " - f"{bar}" - ) - -White spaces -~~~~~~~~~~~~ - -Only put white space at the end of the previous line, so -there is no whitespace at the beginning of the concatenated string. - -For example: - -**Good:** - -.. code-block:: python - - example_string = ( - "Some long concatenated string, " - "with good placement of the " - "whitespaces" - ) - -**Bad:** - -.. code-block:: python - - example_string = ( - "Some long concatenated string," - " with bad placement of the" - " whitespaces" - ) - -Representation function (aka 'repr()') --------------------------------------- - -pandas uses 'repr()' instead of '%r' and '!r'. - -The use of 'repr()' will only happen when the value is not an obvious string. - -For example: - -**Good:** - -.. code-block:: python - - value = str - f"Unknown received value, got: {repr(value)}" - -**Good:** - -.. code-block:: python - - value = str - f"Unknown received type, got: '{type(value).__name__}'" - - -Imports (aim for absolute) -========================== - -In Python 3, absolute imports are recommended. Using absolute imports, doing something -like ``import string`` will import the string module rather than ``string.py`` -in the same directory. As much as possible, you should try to write out -absolute imports that show the whole import chain from top-level pandas. - -Explicit relative imports are also supported in Python 3 but it is not -recommended to use them. Implicit relative imports should never be used -and are removed in Python 3. - -For example: - -:: - - # preferred - import pandas.core.common as com - - # not preferred - from .common import test_base - - # wrong - from common import test_base - - -Miscellaneous -============= - -Reading from a url ------------------- - -**Good:** - -.. code-block:: python - - from pandas.io.common import urlopen - with urlopen('http://www.google.com/') as url: - raw_text = url.read() diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst new file mode 100644 index 0000000000000..9a4de3c2580ab --- /dev/null +++ b/doc/source/development/community.rst @@ -0,0 +1,121 @@ +.. 
_community: + +===================== +Contributor community +===================== + +pandas is a community-driven open source project developed by a large group +of `contributors `_ +and a smaller group of `maintainers `_. +The pandas leadership has made a strong commitment to creating an open, +inclusive, and positive community. Please read the pandas `Code of Conduct +`_ for guidance on how to +interact with others in a way that makes the community thrive. + +We offer several meetings and communication channels to share knowledge and +connect with others within the pandas community. + +Community meeting +----------------- + +The pandas Community Meeting is a regular sync meeting for the project's +maintainers which is open to the community. Everyone is welcome to attend and +contribute to conversations. + +The meetings take place on the second and fourth Wednesdays of each month at 18:00 UTC. + +The minutes of past meetings are available in `this Google Document `__. + + +New contributor meeting +----------------------- + +On the third Wednesday of the month, we hold meetings to welcome and support +new contributors in our community. + +| 👋 you all are invited +| 💬 everyone can present (add yourself to the hackMD agenda) +| 👀 anyone can sit in and listen + +Attendees are new and experienced contributors, as well as a few maintainers. +We aim to answer questions about getting started, or help with work in +progress when possible, as well as get to know each other and share our +learnings and experiences. + +The agenda for the next meeting and minutes of past meetings are available in +`this HackMD `__. + +Calendar +-------- + +This calendar shows all the community meetings. Our community meetings are +ideal for anyone wanting to contribute to pandas, or just curious to know how +current development is going. + +.. raw:: html + + + +You can subscribe to this calendar with the following links: + +* `iCal `__ +* `Google calendar `__ + +Additionally, we'll sometimes have one-off meetings on specific topics. +These will be published on the same calendar. + +`GitHub issue tracker `_ +---------------------------------------------------------------------- + +The pandas contributor community conducts conversations mainly via this channel. +Any community member can open issues to: + +- Report bugs, e.g. "I noticed the behavior of a certain function is + incorrect" +- Request features, e.g. "I would like this error message to be more readable" +- Request documentation improvements, e.g. "I found this section unclear" +- Ask questions, e.g. "I noticed the behavior of a certain function + changed between versions. Is this expected?". + + Ideally, your questions should be related to how pandas works rather + than how you use pandas. `StackOverflow `_ is + better suited for answering usage questions, and we ask that all usage + questions are first asked on StackOverflow. Thank you for respecting our + time and wishes. 🙇 + +Maintainers and frequent contributors might also open issues to discuss the +ongoing development of the project. For example: + +- Report issues with the CI, GitHub Actions, or the performance of pandas +- Open issues relating to the internals +- Start roadmap discussion aligning on proposals for what to do in future + releases or changes to the API. 
+- Open issues relating to the project's website, logo, or governance + +The developer mailing list +-------------------------- + +The pandas mailing list `pandas-dev@python.org `_ is used for long form +conversations and to engage people in the wider community who might not +be active on the issue tracker but we would like to include in discussions. + +.. _community.slack: + +Community slack +--------------- + +We have a chat platform for contributors, maintainers and potential +contributors. This is not a space for user questions, rather for questions about +contributing to pandas. The slack is a private space, specifically meant for +people who are hesitant to bring up their questions or ideas on a large public +mailing list or GitHub. + +If this sounds like the right place for you, you are welcome to join using +`this link `_! +Please remember to follow our `Code of Conduct `_, +and be aware that our admins are monitoring for irrelevant messages and will remove folks who use +our +slack for spam, advertisements and messages not related to the pandas contributing community. And +please remember that slack is not meant to replace the mailing list or issue tracker - all important +announcements and conversations should still happen there. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..d779e31253e09 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -31,13 +31,13 @@ comment letting others know they are working on an issue. While this is ok, you check each issue individually, and it's not possible to find the unassigned ones. For this reason, we implemented a workaround consisting of adding a comment with the exact -text `take`. When you do it, a GitHub action will automatically assign you the issue +text ``take``. When you do it, a GitHub action will automatically assign you the issue (this will take seconds, and may require refreshing the page to see it). By doing this, it's possible to filter the list of issues and find only the unassigned ones. So, a good way to find an issue to start contributing to pandas is to check the list of `unassigned good first issues `_ -and assign yourself one you like by writing a comment with the exact text `take`. +and assign yourself one you like by writing a comment with the exact text ``take``. If for whatever reason you are not able to continue working with the issue, please try to unassign it, so other people know it's available again. You can check the list of @@ -45,8 +45,13 @@ assigned issues, since people may not be working in them anymore. If you want to that is assigned, feel free to kindly ask the current assignee if you can take it (please allow at least a week of inactivity before considering work in the issue discontinued). -Feel free to ask questions on the `mailing list -`_ or on `Gitter`_. +We have several :ref:`contributor community ` communication channels, which you are +welcome to join, and ask questions as you figure things out. Among them are regular meetings for +new contributors, dev meetings, a dev mailing list, and a slack for the contributor community. +All pandas contributors are welcome to these spaces, where they can connect with each other. Even +maintainers who have been with us for a long time felt just like you when they started out, and +are happy to welcome you and support you as you get to know how we work, and where things are. +Take a look at the next sections to learn more. .. 
_contributing.bug_reports: @@ -59,7 +64,7 @@ will allow others to reproduce the bug and provide insight into fixing. See `this blogpost `_ for tips on writing a good bug report. -Trying the bug-producing code out on the *master* branch is often a worthwhile exercise +Trying the bug-producing code out on the *main* branch is often a worthwhile exercise to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed. @@ -109,9 +114,10 @@ version control to allow many people to work together on the project. Some great resources for learning Git: +* the `Git documentation `_. * the `GitHub help pages `_. -* the `NumPy's documentation `_. -* Matthew Brett's `Pydagogue `_. +* the `NumPy documentation `_. +* Matthew Brett's `Pydagogue `_. Getting started with Git ------------------------ @@ -132,221 +138,19 @@ want to clone your fork to your machine:: git clone https://github.com/your-user-name/pandas.git pandas-yourname cd pandas-yourname git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream -This creates the directory `pandas-yourname` and connects your repository to +This creates the directory ``pandas-yourname`` and connects your repository to the upstream (main project) *pandas* repository. Note that performing a shallow clone (with ``--depth==N``, for some ``N`` greater or equal to 1) might break some tests and features as ``pd.show_versions()`` as the version number cannot be computed anymore. -.. _contributing.dev_env: - -Creating a development environment ----------------------------------- - -To test out code changes, you'll need to build pandas from source, which -requires a C compiler and Python environment. If you're making documentation -changes, you can skip to :ref:`contributing.documentation` but you won't be able -to build the documentation locally before pushing your changes. - -Using a Docker container -~~~~~~~~~~~~~~~~~~~~~~~~ - -Instead of manually setting up a development environment, you can use Docker to -automatically create the environment with just several commands. Pandas provides a `DockerFile` -in the root directory to build a Docker image with a full pandas development environment. - -Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, -a popular free IDE, using the `.devcontainer.json` file. -See https://code.visualstudio.com/docs/remote/containers for details. - -.. _contributing.dev_c: - -Installing a C compiler -~~~~~~~~~~~~~~~~~~~~~~~ - -Pandas uses C extensions (mostly written using Cython) to speed up certain -operations. To install pandas from source, you need to compile these C -extensions, which means you need a C compiler. This process depends on which -platform you're using. - -**Windows** - -You will need `Build Tools for Visual Studio 2017 -`_. - -.. warning:: - You DO NOT need to install Visual Studio 2019. - You only need "Build Tools for Visual Studio 2019" found by - scrolling down to "All downloads" -> "Tools for Visual Studio 2019". - -**Mac OS** - -Information about compiler installation can be found here: -https://devguide.python.org/setup/#macos - -**Unix** - -Some Linux distributions will come with a pre-installed C compiler. 
To find out -which compilers (and versions) are installed on your system:: - - # for Debian/Ubuntu: - dpkg --list | grep compiler - # for Red Hat/RHEL/CentOS/Fedora: - yum list installed | grep -i --color compiler - -`GCC (GNU Compiler Collection) `_, is a widely used -compiler, which supports C and a number of other languages. If GCC is listed -as an installed compiler nothing more is required. If no C compiler is -installed (or you wish to install a newer version) you can install a compiler -(GCC in the example code below) with:: - - # for recent Debian/Ubuntu: - sudo apt install build-essential - # for Red Had/RHEL/CentOS/Fedora - yum groupinstall "Development Tools" - -For other Linux distributions, consult your favourite search engine for -compiler installation instructions. - -Let us know if you have any difficulties by opening an issue or reaching out on -`Gitter`_. - -.. _contributing.dev_python: - -Creating a Python environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now that you have a C compiler, create an isolated pandas development -environment: - -* Install either `Anaconda `_ or `miniconda - `_ -* Make sure your conda is up to date (``conda update conda``) -* Make sure that you have :ref:`cloned the repository ` -* ``cd`` to the pandas source directory - -We'll now kick off a three-step process: - -1. Install the build dependencies -2. Build and install pandas -3. Install the optional dependencies - -.. code-block:: none - - # Create and activate the build environment - conda env create -f environment.yml - conda activate pandas-dev - - # or with older versions of Anaconda: - source activate pandas-dev - - # Build and install pandas - python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - -At this point you should be able to import pandas from your locally built version:: - - $ python # start an interpreter - >>> import pandas - >>> print(pandas.__version__) - 0.22.0.dev0+29.g4ad6d4d74 - -This will create the new environment, and not touch any of your existing environments, -nor any existing Python installation. - -To view your environments:: - - conda info -e - -To return to your root environment:: - - conda deactivate - -See the full conda docs `here `__. - -.. _contributing.pip: - -Creating a Python environment (pip) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you aren't using conda for your development environment, follow these instructions. -You'll need to have at least Python 3.6.1 installed on your system. - -**Unix**/**Mac OS with virtualenv** - -.. code-block:: bash - - # Create a virtual environment - # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev - # Any parent directories should already exist - python3 -m venv ~/virtualenvs/pandas-dev - - # Activate the virtualenv - . ~/virtualenvs/pandas-dev/bin/activate - - # Install the build dependencies - python -m pip install -r requirements-dev.txt - - # Build and install pandas - python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - -**Unix**/**Mac OS with pyenv** - -Consult the docs for setting up pyenv `here `__. - -.. code-block:: bash - - # Create a virtual environment - # Use an ENV_DIR of your choice. 
We'll use ~/Users//.pyenv/versions/pandas-dev - - pyenv virtualenv - - # For instance: - pyenv virtualenv 3.7.6 pandas-dev - - # Activate the virtualenv - pyenv activate pandas-dev - - # Now install the build dependencies in the cloned pandas repo - python -m pip install -r requirements-dev.txt - - # Build and install pandas - python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - -**Windows** - -Below is a brief overview on how to set-up a virtual environment with Powershell -under Windows. For details please refer to the -`official virtualenv user guide `__ - -Use an ENV_DIR of your choice. We'll use ~\\virtualenvs\\pandas-dev where -'~' is the folder pointed to by either $env:USERPROFILE (Powershell) or -%USERPROFILE% (cmd.exe) environment variable. Any parent directories -should already exist. - -.. code-block:: powershell - - # Create a virtual environment - python -m venv $env:USERPROFILE\virtualenvs\pandas-dev - - # Activate the virtualenv. Use activate.bat for cmd.exe - ~\virtualenvs\pandas-dev\Scripts\Activate.ps1 - - # Install the build dependencies - python -m pip install -r requirements-dev.txt - - # Build and install pandas - python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - Creating a branch ----------------- -You want your master branch to reflect only production-ready code, so create a +You want your main branch to reflect only production-ready code, so create a feature branch for making your changes. For example:: git branch shiny-new-feature @@ -361,1034 +165,17 @@ changes in this branch specific to one bug or feature so it is clear what the branch brings to pandas. You can have many shiny-new-features and switch in between them using the git checkout command. -When creating this branch, make sure your master branch is up to date with -the latest upstream master version. To update your local master branch, you +When creating this branch, make sure your main branch is up to date with +the latest upstream main version. To update your local main branch, you can do:: - git checkout master - git pull upstream master --ff-only + git checkout main + git pull upstream main --ff-only -When you want to update the feature branch with changes in master after +When you want to update the feature branch with changes in main after you created the branch, check the section on :ref:`updating a PR `. -.. _contributing.documentation: - -Contributing to the documentation -================================= - -Contributing to the documentation benefits everyone who uses pandas. -We encourage you to help us improve the documentation, and -you don't have to be an expert on pandas to do so! In fact, -there are sections of the docs that are worse off after being written by -experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a great way to ensure it will help -the next person. - -.. contents:: Documentation: - :local: - - -About the pandas documentation --------------------------------- - -The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx `__. The -Sphinx Documentation has an excellent `introduction to reST -`__. Review the Sphinx docs to perform more -complex changes to the documentation as well. 
- -Some other important things to know about the docs: - -* The pandas documentation consists of two parts: the docstrings in the code - itself and the docs in this folder ``doc/``. - - The docstrings provide a clear explanation of the usage of the individual - functions, while the documentation in this folder consists of tutorial-like - overviews per topic together with some other information (what's new, - installation, etc). - -* The docstrings follow a pandas convention, based on the **Numpy Docstring - Standard**. Follow the :ref:`pandas docstring guide ` for detailed - instructions on how to write a correct docstring. - - .. toctree:: - :maxdepth: 2 - - contributing_docstring.rst - -* The tutorials make heavy use of the `ipython directive - `_ sphinx extension. - This directive lets you put code in the documentation which will be run - during the doc build. For example:: - - .. ipython:: python - - x = 2 - x**3 - - will be rendered as:: - - In [1]: x = 2 - - In [2]: x**3 - Out[2]: 8 - - Almost all code examples in the docs are run (and the output saved) during the - doc build. This approach means that code examples will always be up to date, - but it does make the doc building a bit more complex. - -* Our API documentation files in ``doc/source/reference`` house the auto-generated - documentation from the docstrings. For classes, there are a few subtleties - around controlling which methods and attributes have pages auto-generated. - - We have two autosummary templates for classes. - - 1. ``_templates/autosummary/class.rst``. Use this when you want to - automatically generate a page for every public method and attribute on the - class. The ``Attributes`` and ``Methods`` sections will be automatically - added to the class' rendered documentation by numpydoc. See ``DataFrame`` - for an example. - - 2. ``_templates/autosummary/class_without_autosummary``. Use this when you - want to pick a subset of methods / attributes to auto-generate pages for. - When using this template, you should include an ``Attributes`` and - ``Methods`` section in the class docstring. See ``CategoricalIndex`` for an - example. - - Every method should be included in a ``toctree`` in one of the documentation files in - ``doc/source/reference``, else Sphinx - will emit a warning. - -.. note:: - - The ``.rst`` files are used to automatically generate Markdown and HTML versions - of the docs. For this reason, please do not edit ``CONTRIBUTING.md`` directly, - but instead make any changes to ``doc/source/development/contributing.rst``. Then, to - generate ``CONTRIBUTING.md``, use `pandoc `_ - with the following command:: - - pandoc doc/source/development/contributing.rst -t markdown_github > CONTRIBUTING.md - -The utility script ``scripts/validate_docstrings.py`` can be used to get a csv -summary of the API documentation. And also validate common errors in the docstring -of a specific class, function or method. The summary also compares the list of -methods documented in the files in ``doc/source/reference`` (which is used to generate -the `API Reference `_ page) -and the actual public methods. -This will identify methods documented in ``doc/source/reference`` that are not actually -class methods, and existing methods that are not documented in ``doc/source/reference``. - - -Updating a pandas docstring ------------------------------ - -When improving a single function or method's docstring, it is not necessarily -needed to build the full documentation (see next section). 
-However, there is a script that checks a docstring (for example for the ``DataFrame.mean`` method):: - - python scripts/validate_docstrings.py pandas.DataFrame.mean - -This script will indicate some formatting errors if present, and will also -run and test the examples included in the docstring. -Check the :ref:`pandas docstring guide ` for a detailed guide -on how to format the docstring. - -The examples in the docstring ('doctests') must be valid Python code, -that in a deterministic way returns the presented output, and that can be -copied and run by users. This can be checked with the script above, and is -also tested on Travis. A failing doctest will be a blocker for merging a PR. -Check the :ref:`examples ` section in the docstring guide -for some tips and tricks to get the doctests passing. - -When doing a PR with a docstring update, it is good to post the -output of the validation script in a comment on github. - - -How to build the pandas documentation ---------------------------------------- - -Requirements -~~~~~~~~~~~~ - -First, you need to have a development environment to be able to build pandas -(see the docs on :ref:`creating a development environment above `). - -Building the documentation -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -So how do you build the docs? Navigate to your local -``doc/`` directory in the console and run:: - - python make.py html - -Then you can find the HTML output in the folder ``doc/build/html/``. - -The first time you build the docs, it will take quite a while because it has to run -all the code examples and build all the generated docstring pages. In subsequent -evocations, sphinx will try to only build the pages that have been modified. - -If you want to do a full clean build, do:: - - python make.py clean - python make.py html - -You can tell ``make.py`` to compile only a single section of the docs, greatly -reducing the turn-around time for checking your changes. - -:: - - # omit autosummary and API section - python make.py clean - python make.py --no-api - - # compile the docs with only a single section, relative to the "source" folder. - # For example, compiling only this guide (doc/source/development/contributing.rst) - python make.py clean - python make.py --single development/contributing.rst - - # compile the reference docs for a single function - python make.py clean - python make.py --single pandas.DataFrame.join - -For comparison, a full documentation build may take 15 minutes, but a single -section may take 15 seconds. Subsequent builds, which only process portions -you have changed, will be faster. - -You can also specify to use multiple cores to speed up the documentation build:: - - python make.py html --num-jobs 4 - -Open the following file in a web browser to see the full documentation you -just built:: - - doc/build/html/index.html - -And you'll have the satisfaction of seeing your new and improved documentation! - -.. _contributing.dev_docs: - -Building master branch documentation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When pull requests are merged into the pandas ``master`` branch, the main parts of -the documentation are also built by Travis-CI. These docs are then hosted `here -`__, see also -the :ref:`Continuous Integration ` section. - -.. _contributing.code: - -Contributing to the code base -============================= - -.. contents:: Code Base: - :local: - -Code standards --------------- - -Writing good code is not just about what you write. It is also about *how* you -write it. 
During :ref:`Continuous Integration ` testing, several
-tools will be run to check your code for stylistic errors.
-Generating any warnings will cause the test to fail.
-Thus, good style is a requirement for submitting code to pandas.
-
-There is a tool in pandas to help contributors verify their changes before
-contributing them to the project::
-
-    ./ci/code_checks.sh
-
-The script verifies the linting of code files, looks for common mistake patterns
-(like missing spaces around sphinx directives that prevent the documentation from
-rendering properly), and validates the doctests. It is possible to
-run the checks independently by using the parameters ``lint``, ``patterns`` and
-``doctests`` (e.g. ``./ci/code_checks.sh lint``).
-
-In addition, because a lot of people use our library, it is important that we
-do not make sudden changes to the code that could break a lot of user code;
-that is, we need it to be as *backwards compatible* as possible to avoid mass
-breakages.
-
-Additional standards are outlined in the :ref:`pandas code style guide `.
-
-Optional dependencies
----------------------
-
-Optional dependencies (e.g. matplotlib) should be imported with the private helper
-``pandas.compat._optional.import_optional_dependency``. This ensures a
-consistent error message when the dependency is not met.
-
-All methods using an optional dependency should include a test asserting that an
-``ImportError`` is raised when the optional dependency is not found. This test
-should be skipped if the library is present.
-
-All optional dependencies should be documented in
-:ref:`install.optional_dependencies` and the minimum required version should be
-set in the ``pandas.compat._optional.VERSIONS`` dict.
-
-C (cpplint)
-~~~~~~~~~~~
-
-pandas uses the `Google `_
-standard. Google provides an open source style checker called ``cpplint``, but we
-use a fork of it that can be found `here `__.
-Here are *some* of the more common ``cpplint`` issues:
-
-* we restrict line-length to 80 characters to promote readability
-* every header file must include a header guard to avoid name collisions if re-included
-
-:ref:`Continuous Integration ` will run the
-`cpplint `_ tool
-and report any stylistic errors in your code. Therefore, it is helpful before
-submitting code to run the check yourself::
-
-    cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir modified-c-file
-
-You can also run this command on an entire directory if necessary::
-
-    cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive modified-c-directory
-
-To make your commits compliant with this standard, you can install the
-`ClangFormat `_ tool, which can be
-downloaded `here `__. To configure, in your home directory,
-run the following command::
-
-    clang-format -style=google -dump-config > .clang-format
-
-Then modify the file to ensure that any indentation width parameters are at least four.
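-
-For example, after editing, the indentation-related entries might read as follows
-(the keys shown here are illustrative; the exact set of options in the dumped file
-depends on your ClangFormat version)::
-
-    IndentWidth: 4
-    ConstructorInitializerIndentWidth: 4
-    ContinuationIndentWidth: 4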
-Once configured, you can run the tool as follows:: - - clang-format modified-c-file - -This will output what your file will look like if the changes are made, and to apply -them, run the following command:: - - clang-format -i modified-c-file - -To run the tool on an entire directory, you can run the following analogous commands:: - - clang-format modified-c-directory/*.c modified-c-directory/*.h - clang-format -i modified-c-directory/*.c modified-c-directory/*.h - -Do note that this tool is best-effort, meaning that it will try to correct as -many errors as possible, but it may not correct *all* of them. Thus, it is -recommended that you run ``cpplint`` to double check and make any other style -fixes manually. - -.. _contributing.code-formatting: - -Python (PEP8 / black) -~~~~~~~~~~~~~~~~~~~~~ - -pandas follows the `PEP8 `_ standard -and uses `Black `_ and -`Flake8 `_ to ensure a consistent code -format throughout the project. - -:ref:`Continuous Integration ` will run those tools and -report any stylistic errors in your code. Therefore, it is helpful before -submitting code to run the check yourself:: - - black pandas - git diff upstream/master -u -- "*.py" | flake8 --diff - -to auto-format your code. Additionally, many editors have plugins that will -apply ``black`` as you edit files. - -You should use a ``black`` version >= 19.10b0 as previous versions are not compatible -with the pandas codebase. - -If you wish to run these checks automatically, we encourage you to use -:ref:`pre-commits ` instead. - -One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this -command will catch any stylistic errors in your changes specifically, but -be beware it may not catch all of them. For example, if you delete the only -usage of an imported function, it is stylistically incorrect to import an -unused function. However, style-checking the diff will not catch this because -the actual import is not part of the diff. Thus, for completeness, you should -run this command, though it may take longer:: - - git diff upstream/master --name-only -- "*.py" | xargs -r flake8 - -Note that on OSX, the ``-r`` flag is not available, so you have to omit it and -run this slightly modified command:: - - git diff upstream/master --name-only -- "*.py" | xargs flake8 - -Windows does not support the ``xargs`` command (unless installed for example -via the `MinGW `__ toolchain), but one can imitate the -behaviour as follows:: - - for /f %i in ('git diff upstream/master --name-only -- "*.py"') do flake8 %i - -This will get all the files being changed by the PR (and ending with ``.py``), -and run ``flake8`` on them, one after the other. - -Note that these commands can be run analogously with ``black``. - -.. _contributing.import-formatting: - -Import formatting -~~~~~~~~~~~~~~~~~ -pandas uses `isort `__ to standardise import -formatting across the codebase. - -A guide to import layout as per pep8 can be found `here `__. - -A summary of our current import sections ( in order ): - -* Future -* Python Standard Library -* Third Party -* ``pandas._libs``, ``pandas.compat``, ``pandas.util._*``, ``pandas.errors`` (largely not dependent on ``pandas.core``) -* ``pandas.core.dtypes`` (largely not dependent on the rest of ``pandas.core``) -* Rest of ``pandas.core.*`` -* Non-core ``pandas.io``, ``pandas.plotting``, ``pandas.tseries`` -* Local application/library specific imports - -Imports are alphabetically sorted within these sections. 
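-
-As an illustration, an import block laid out according to these sections might
-look like the following (the specific modules are only examples, not a required
-set)::
-
-    from __future__ import annotations
-
-    from datetime import datetime
-
-    import numpy as np
-
-    from pandas._libs import lib
-    from pandas.errors import AbstractMethodError
-
-    from pandas.core.dtypes.common import is_integer
-
-    from pandas.core.frame import DataFrame
-
-    from pandas.io.common import stringify_path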
- -As part of :ref:`Continuous Integration ` checks we run:: - - isort --recursive --check-only pandas - -to check that imports are correctly formatted as per the `setup.cfg`. - -If you see output like the below in :ref:`Continuous Integration ` checks: - -.. code-block:: shell - - Check import format using isort - ERROR: /home/travis/build/pandas-dev/pandas/pandas/io/pytables.py Imports are incorrectly sorted - Check import format using isort DONE - The command "ci/code_checks.sh" exited with 1 - -You should run:: - - isort pandas/io/pytables.py - -to automatically format imports correctly. This will modify your local copy of the files. - -The `--recursive` flag can be passed to sort all files in a directory. - -Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: - - git diff upstream/master --name-only -- "*.py" | xargs -r isort - -Where similar caveats apply if you are on OSX or Windows. - -You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. - -.. _contributing.pre-commit: - -Pre-commit -~~~~~~~~~~ - -You can run many of these styling checks manually as we have described above. However, -we encourage you to use `pre-commit hooks `_ instead -to automatically run ``black``, ``flake8``, ``isort`` when you make a git commit. This -can be done by installing ``pre-commit``:: - - pip install pre-commit - -and then running:: - - pre-commit install - -from the root of the pandas repository. Now all of the styling checks will be -run each time you commit changes without your needing to run each one manually. -In addition, using this pre-commit hook will also allow you to more easily -remain up-to-date with our code checks as they change. - -Note that if needed, you can skip these checks with ``git commit --no-verify``. - -Backwards compatibility -~~~~~~~~~~~~~~~~~~~~~~~ - -Please try to maintain backward compatibility. pandas has lots of users with lots of -existing code, so don't break it if at all possible. If you think breakage is required, -clearly state why as part of the pull request. Also, be careful when changing method -signatures and add deprecation warnings where needed. Also, add the deprecated sphinx -directive to the deprecated functions or methods. - -If a function with the same arguments as the one being deprecated exist, you can use -the ``pandas.util._decorators.deprecate``: - -.. code-block:: python - - from pandas.util._decorators import deprecate - - deprecate('old_func', 'new_func', '1.1.0') - -Otherwise, you need to do it manually: - -.. code-block:: python - - import warnings - - - def old_func(): - """Summary of the function. - - .. deprecated:: 1.1.0 - Use new_func instead. - """ - warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) - new_func() - - - def new_func(): - pass - -You'll also need to - -1. Write a new test that asserts a warning is issued when calling with the deprecated argument -2. Update all of pandas existing tests and code to use the new argument - -See :ref:`contributing.warnings` for more. - -.. _contributing.type_hints: - -Type hints ----------- - -pandas strongly encourages the use of :pep:`484` style type hints. New development should contain type hints and pull requests to annotate existing code are accepted as well! - -Style guidelines -~~~~~~~~~~~~~~~~ - -Types imports should follow the ``from typing import ...`` convention. So rather than - -.. 
code-block:: python - - import typing - - primes: typing.List[int] = [] - -You should write - -.. code-block:: python - - from typing import List, Optional, Union - - primes: List[int] = [] - -``Optional`` should be used where applicable, so instead of - -.. code-block:: python - - maybe_primes: List[Union[int, None]] = [] - -You should write - -.. code-block:: python - - maybe_primes: List[Optional[int]] = [] - -In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like - -.. code-block:: python - - class SomeClass1: - str = None - -The appropriate way to annotate this would be as follows - -.. code-block:: python - - str_type = str - - class SomeClass2: - str: str_type = None - -In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example - -.. code-block:: python - - from typing import cast - - from pandas.core.dtypes.common import is_number - - def cannot_infer_bad(obj: Union[str, int, float]): - - if is_number(obj): - ... - else: # Reasonably only str objects would reach this but... - obj = cast(str, obj) # Mypy complains without this! - return obj.upper() - -The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable - -.. code-block:: python - - def cannot_infer_good(obj: Union[str, int, float]): - - if isinstance(obj, str): - return obj.upper() - else: - ... - -With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. - -pandas-specific types -~~~~~~~~~~~~~~~~~~~~~ - -Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. - -For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module - -.. code-block:: python - - from pandas._typing import Dtype - - def as_type(dtype: Dtype) -> ...: - ... - -This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. - -Validating type hints -~~~~~~~~~~~~~~~~~~~~~ - -pandas uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running - -.. code-block:: shell - - mypy pandas - -.. 
_contributing.ci: - -Testing with continuous integration ------------------------------------ - -The pandas test suite will run automatically on `Travis-CI `__ and -`Azure Pipelines `__ -continuous integration services, once your pull request is submitted. -However, if you wish to run the test suite on a branch prior to submitting the pull request, -then the continuous integration services need to be hooked to your GitHub repository. Instructions are here -for `Travis-CI `__ and -`Azure Pipelines `__. - -A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, -then you will get a red 'X', where you can click through to see the individual failed tests. -This is an example of a green build. - -.. image:: ../_static/ci.png - -.. note:: - - Each time you push to *your* fork, a *new* run of the tests will be triggered on the CI. - You can enable the auto-cancel feature, which removes any non-currently-running tests for that same pull-request, for - `Travis-CI here `__. - -.. _contributing.tdd: - - -Test-driven development/code writing ------------------------------------- - -pandas is serious about testing and strongly encourages contributors to embrace -`test-driven development (TDD) `_. -This development process "relies on the repetition of a very short development cycle: -first the developer writes an (initially failing) automated test case that defines a desired -improvement or new function, then produces the minimum amount of code to pass that test." -So, before actually writing any code, you should write your tests. Often the test can be -taken from the original GitHub issue. However, it is always worth considering additional -use cases and writing corresponding tests. - -Adding tests is one of the most common requests after code is pushed to pandas. Therefore, -it is worth getting in the habit of writing tests ahead of time so this is never an issue. - -Like many packages, pandas uses `pytest -`_ and the convenient -extensions in `numpy.testing -`_. - -.. note:: - - The earliest supported pytest version is 5.0.1. - -Writing tests -~~~~~~~~~~~~~ - -All tests should go into the ``tests`` subdirectory of the specific package. -This folder contains many current examples of tests, and we suggest looking to these for -inspiration. If your test requires working with files or -network connectivity, there is more information on the `testing page -`_ of the wiki. - -The ``pandas._testing`` module has many special ``assert`` functions that -make it easier to make statements about whether Series or DataFrame objects are -equivalent. The easiest way to verify that your code is correct is to -explicitly construct the result you expect, then compare the actual result to -the expected correct result:: - - def test_pivot(self): - data = { - 'index' : ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns' : ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values' : [1., 2., 3., 3., 2., 1.] - } - - frame = DataFrame(data) - pivoted = frame.pivot(index='index', columns='columns', values='values') - - expected = DataFrame({ - 'One' : {'A' : 1., 'B' : 2., 'C' : 3.}, - 'Two' : {'A' : 1., 'B' : 2., 'C' : 3.} - }) - - assert_frame_equal(pivoted, expected) - -Please remember to add the Github Issue Number as a comment to a new test. -E.g. "# brief comment, see GH#28907" - -Transitioning to ``pytest`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -pandas existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. - -.. 
code-block:: python - - class TestReallyCoolFeature: - pass - -Going forward, we are moving to a more *functional* style using the `pytest `__ framework, which offers a richer testing -framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: - -.. code-block:: python - - def test_really_cool_feature(): - pass - -Using ``pytest`` -~~~~~~~~~~~~~~~~ - -Here is an example of a self-contained set of tests that illustrate multiple features that we like to use. - -* functional style: tests are like ``test_*`` and *only* take arguments that are either fixtures or parameters -* ``pytest.mark`` can be used to set metadata on test functions, e.g. ``skip`` or ``xfail``. -* using ``parametrize``: allow testing of multiple cases -* to set a mark on a parameter, ``pytest.param(..., marks=...)`` syntax should be used -* ``fixture``, code for object construction, on a per-test basis -* using bare ``assert`` for scalars and truth-testing -* ``tm.assert_series_equal`` (and its counter part ``tm.assert_frame_equal``), for pandas object comparisons. -* the typical pattern of constructing an ``expected`` and comparing versus the ``result`` - -We would name this file ``test_cool_feature.py`` and put in an appropriate place in the ``pandas/tests/`` structure. - -.. code-block:: python - - import pytest - import numpy as np - import pandas as pd - - - @pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64']) - def test_dtypes(dtype): - assert str(np.dtype(dtype)) == dtype - - - @pytest.mark.parametrize( - 'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip), - pytest.param('int32', marks=pytest.mark.xfail( - reason='to show how it works'))]) - def test_mark(dtype): - assert str(np.dtype(dtype)) == 'float32' - - - @pytest.fixture - def series(): - return pd.Series([1, 2, 3]) - - - @pytest.fixture(params=['int8', 'int16', 'int32', 'int64']) - def dtype(request): - return request.param - - - def test_series(series, dtype): - result = series.astype(dtype) - assert result.dtype == dtype - - expected = pd.Series([1, 2, 3], dtype=dtype) - tm.assert_series_equal(result, expected) - - -A test run of this yields - -.. code-block:: shell - - ((pandas) bash-3.2$ pytest test_cool_feature.py -v - =========================== test session starts =========================== - platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 - collected 11 items - - tester.py::test_dtypes[int8] PASSED - tester.py::test_dtypes[int16] PASSED - tester.py::test_dtypes[int32] PASSED - tester.py::test_dtypes[int64] PASSED - tester.py::test_mark[float32] PASSED - tester.py::test_mark[int16] SKIPPED - tester.py::test_mark[int32] xfail - tester.py::test_series[int8] PASSED - tester.py::test_series[int16] PASSED - tester.py::test_series[int32] PASSED - tester.py::test_series[int64] PASSED - -Tests that we have ``parametrized`` are now accessible via the test name, for example we could run these with ``-k int8`` to sub-select *only* those tests which match ``int8``. - - -.. code-block:: shell - - ((pandas) bash-3.2$ pytest test_cool_feature.py -v -k int8 - =========================== test session starts =========================== - platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 - collected 11 items - - test_cool_feature.py::test_dtypes[int8] PASSED - test_cool_feature.py::test_series[int8] PASSED - - -.. 
_using-hypothesis: - -Using ``hypothesis`` -~~~~~~~~~~~~~~~~~~~~ - -Hypothesis is a library for property-based testing. Instead of explicitly -parametrizing a test, you can describe *all* valid inputs and let Hypothesis -try to find a failing input. Even better, no matter how many random examples -it tries, Hypothesis always reports a single minimal counterexample to your -assertions - often an example that you would never have thought to test. - -See `Getting Started with Hypothesis `_ -for more of an introduction, then `refer to the Hypothesis documentation -for details `_. - -.. code-block:: python - - import json - from hypothesis import given, strategies as st - - any_json_value = st.deferred(lambda: st.one_of( - st.none(), st.booleans(), st.floats(allow_nan=False), st.text(), - st.lists(any_json_value), st.dictionaries(st.text(), any_json_value) - )) - - - @given(value=any_json_value) - def test_json_roundtrip(value): - result = json.loads(json.dumps(value)) - assert value == result - -This test shows off several useful features of Hypothesis, as well as -demonstrating a good use-case: checking properties that should hold over -a large or complicated domain of inputs. - -To keep the Pandas test suite running quickly, parametrized tests are -preferred if the inputs or logic are simple, with Hypothesis tests reserved -for cases with complex logic or where there are too many combinations of -options or subtle interactions to test (or think of!) all of them. - -.. _contributing.warnings: - -Testing warnings -~~~~~~~~~~~~~~~~ - -By default, one of pandas CI workers will fail if any unhandled warnings are emitted. - -If your change involves checking that a warning is actually emitted, use -``tm.assert_produces_warning(ExpectedWarning)``. - - -.. code-block:: python - - import pandas._testing as tm - - - df = pd.DataFrame() - with tm.assert_produces_warning(FutureWarning): - df.some_operation() - -We prefer this to the ``pytest.warns`` context manager because ours checks that the warning's -stacklevel is set correctly. The stacklevel is what ensure the *user's* file name and line number -is printed in the warning, rather than something internal to pandas. It represents the number of -function calls from user code (e.g. ``df.some_operation()``) to the function that actually emits -the warning. Our linter will fail the build if you use ``pytest.warns`` in a test. - -If you have a test that would emit a warning, but you aren't actually testing the -warning itself (say because it's going to be removed in the future, or because we're -matching a 3rd-party library's behavior), then use ``pytest.mark.filterwarnings`` to -ignore the error. - -.. code-block:: python - - @pytest.mark.filterwarnings("ignore:msg:category") - def test_thing(self): - ... - -If the test generates a warning of class ``category`` whose message starts -with ``msg``, the warning will be ignored and the test will pass. - -If you need finer-grained control, you can use Python's usual -`warnings module `__ -to control whether a warning is ignored / raised at different places within -a single test. - -.. code-block:: python - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - # Or use warnings.filterwarnings(...) - -Alternatively, consider breaking up the unit test. 
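-
-As an illustration of the finer-grained approach, a test may need to silence a
-known but unrelated warning while it sets up its data, yet still assert the
-warning it actually cares about (``make_deprecated_frame`` and
-``some_operation`` are hypothetical names used only for illustration):
-
-.. code-block:: python
-
-    import warnings
-
-    import pandas._testing as tm
-
-
-    def test_some_operation_warns():
-        with warnings.catch_warnings():
-            # hypothetical helper that emits an unrelated FutureWarning
-            warnings.simplefilter("ignore", FutureWarning)
-            df = make_deprecated_frame()
-
-        # the warning this test is actually about
-        with tm.assert_produces_warning(UserWarning):
-            df.some_operation()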
- - -Running the test suite ----------------------- - -The tests can then be run directly inside your Git clone (without having to -install pandas) by typing:: - - pytest pandas - -The tests suite is exhaustive and takes around 20 minutes to run. Often it is -worth running only a subset of tests first around your changes before running the -entire suite. - -The easiest way to do this is with:: - - pytest pandas/path/to/test.py -k regex_matching_test_name - -Or with one of the following constructs:: - - pytest pandas/tests/[test-module].py - pytest pandas/tests/[test-module].py::[TestClass] - pytest pandas/tests/[test-module].py::[TestClass]::[test_method] - -Using `pytest-xdist `_, one can -speed up local testing on multicore machines. To use this feature, you will -need to install `pytest-xdist` via:: - - pip install pytest-xdist - -Two scripts are provided to assist with this. These scripts distribute -testing across 4 threads. - -On Unix variants, one can type:: - - test_fast.sh - -On Windows, one can type:: - - test_fast.bat - -This can significantly reduce the time it takes to locally run tests before -submitting a pull request. - -For more, see the `pytest `_ documentation. - -Furthermore one can run - -.. code-block:: python - - pd.test() - -with an imported pandas to run tests similarly. - -Running the performance test suite ----------------------------------- - -Performance matters and it is worth considering whether your code has introduced -performance regressions. pandas is in the process of migrating to -`asv benchmarks `__ -to enable easy monitoring of the performance of critical pandas operations. -These benchmarks are all found in the ``pandas/asv_bench`` directory, and the -test results can be found `here `__. - -To use all features of asv, you will need either ``conda`` or -``virtualenv``. For more details please check the `asv installation -webpage `_. - -To install asv:: - - pip install git+https://github.com/spacetelescope/asv - -If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: - - asv continuous -f 1.1 upstream/master HEAD - -You can replace ``HEAD`` with the name of the branch you are working on, -and report benchmarks that changed by more than 10%. -The command uses ``conda`` by default for creating the benchmark -environments. If you want to use virtualenv instead, write:: - - asv continuous -f 1.1 -E virtualenv upstream/master HEAD - -The ``-E virtualenv`` option should be added to all ``asv`` commands -that run benchmarks. The default value is defined in ``asv.conf.json``. - -Running the full test suite can take up to one hour and use up to 3GB of RAM. -Usually it is sufficient to paste only a subset of the results into the pull -request to show that the committed changes do not cause unexpected performance -regressions. You can run specific benchmarks using the ``-b`` flag, which -takes a regular expression. For example, this will only run tests from a -``pandas/asv_bench/benchmarks/groupby.py`` file:: - - asv continuous -f 1.1 upstream/master HEAD -b ^groupby - -If you want to only run a specific group of tests from a file, you can do it -using ``.`` as a separator. For example:: - - asv continuous -f 1.1 upstream/master HEAD -b groupby.GroupByMethods - -will only run the ``GroupByMethods`` benchmark defined in ``groupby.py``. - -You can also run the benchmark suite using the version of ``pandas`` -already installed in your current Python environment. 
This can be -useful if you do not have virtualenv or conda, or are using the -``setup.py develop`` approach discussed above; for the in-place build -you need to set ``PYTHONPATH``, e.g. -``PYTHONPATH="$PWD/.." asv [remaining arguments]``. -You can run benchmarks using an existing Python -environment by:: - - asv run -e -E existing - -or, to use a specific Python interpreter,:: - - asv run -e -E existing:python3.6 - -This will display stderr from the benchmarks, and use your local -``python`` that comes from your ``$PATH``. - -Information on how to write a benchmark and how to use asv can be found in the -`asv documentation `_. - -Documenting your code ---------------------- - -Changes should be reflected in the release notes located in ``doc/source/whatsnew/vx.y.z.rst``. -This file contains an ongoing change log for each release. Add an entry to this file to -document your fix, enhancement or (unavoidable) breaking change. Make sure to include the -GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the -issue/pull request number). - -If your code is an enhancement, it is most likely necessary to add usage -examples to the existing documentation. This can be done following the section -regarding documentation :ref:`above `. -Further, to let users know when this feature was added, the ``versionadded`` -directive is used. The sphinx syntax for that is: - -.. code-block:: rst - - .. versionadded:: 1.1.0 - -This will put the text *New in version 1.1.0* wherever you put the sphinx -directive. This should also be put in the docstring when adding a new function -or method (`example `__) -or a new keyword argument (`example `__). - Contributing your changes to pandas ===================================== @@ -1414,30 +201,10 @@ Doing 'git status' again should give something like:: # modified: /relative/path/to/file-you-added.py # -Finally, commit your changes to your local repository with an explanatory message. pandas -uses a convention for commit message prefixes and layout. Here are -some common prefixes along with general guidelines for when to use them: - -* ENH: Enhancement, new functionality -* BUG: Bug fix -* DOC: Additions/updates to documentation -* TST: Additions/updates to tests -* BLD: Updates to the build process/scripts -* PERF: Performance improvement -* TYP: Type annotations -* CLN: Code cleanup - -The following defines how a commit message should be structured. Please reference the -relevant GitHub issues in your commit message using GH1234 or #1234. Either style -is fine, but the former is generally preferred: +Finally, commit your changes to your local repository with an explanatory commit +message:: -* a subject line with `< 80` chars. -* One blank line. -* Optionally, a commit message body. - -Now you can commit your changes in your local repository:: - - git commit -m + git commit -m "your commit message goes here" .. _contributing.push-code: @@ -1476,22 +243,34 @@ double check your branch changes against the branch it was based on: #. Navigate to your repository on GitHub -- https://github.com/your-user-name/pandas #. Click on ``Branches`` #. Click on the ``Compare`` button for your feature branch -#. Select the ``base`` and ``compare`` branches, if necessary. This will be ``master`` and +#. Select the ``base`` and ``compare`` branches, if necessary. This will be ``main`` and ``shiny-new-feature``, respectively. Finally, make the pull request ------------------------------ -If everything looks good, you are ready to make a pull request. 
A pull request is how +If everything looks good, you are ready to make a pull request. A pull request is how code from a local repository becomes available to the GitHub community and can be looked -at and eventually merged into the master version. This pull request and its associated -changes will eventually be committed to the master branch and available in the next -release. To submit a pull request: +at and eventually merged into the main version. This pull request and its associated +changes will eventually be committed to the main branch and available in the next +release. To submit a pull request: #. Navigate to your repository on GitHub -#. Click on the ``Pull Request`` button +#. Click on the ``Compare & pull request`` button #. You can then click on ``Commits`` and ``Files Changed`` to make sure everything looks okay one last time +#. Write a descriptive title that includes prefixes. pandas uses a convention for title + prefixes. Here are some common ones along with general guidelines for when to use them: + + * ENH: Enhancement, new functionality + * BUG: Bug fix + * DOC: Additions/updates to documentation + * TST: Additions/updates to tests + * BLD: Updates to the build process/scripts + * PERF: Performance improvement + * TYP: Type annotations + * CLN: Code cleanup + #. Write a description of your changes in the ``Preview Discussion`` tab #. Click ``Send Pull Request``. @@ -1511,17 +290,17 @@ automatically updated. Pushing them to GitHub again is done by:: git push origin shiny-new-feature This will automatically update your pull request with the latest code and restart the -:ref:`Continuous Integration ` tests. +:any:`Continuous Integration ` tests. Another reason you might need to update your pull request is to solve conflicts -with changes that have been merged into the master branch since you opened your +with changes that have been merged into the main branch since you opened your pull request. -To do this, you need to "merge upstream master" in your branch:: +To do this, you need to "merge upstream main" in your branch:: git checkout shiny-new-feature git fetch upstream - git merge upstream/master + git merge upstream/main If there are no conflicts (or they could be fixed automatically), a file with a default commit message will open, and you can simply save and quit this file. @@ -1533,7 +312,7 @@ Once the conflicts are merged and the files where the conflicts were solved are added, you can run ``git commit`` to save those fixes. If you have uncommitted changes at the moment you want to update the branch with -master, you will need to ``stash`` them prior to updating (see the +main, you will need to ``stash`` them prior to updating (see the `stash docs `__). This will effectively store your changes and they can be reapplied after updating. @@ -1542,30 +321,35 @@ request by pushing to the branch on GitHub:: git push origin shiny-new-feature -Delete your merged branch (optional) ------------------------------------- +Autofixing formatting errors +---------------------------- -Once your feature branch is accepted into upstream, you'll probably want to get rid of -the branch. First, merge upstream master into your branch so git knows it is safe to -delete your branch:: +We use several styling checks (e.g. ``black``, ``flake8``, ``isort``) which are run after +you make a pull request. - git fetch upstream - git checkout master - git merge upstream/master +To automatically fix formatting errors on each commit you make, you can +set up pre-commit yourself. 
First, create a Python :ref:`environment +` and then set up :ref:`pre-commit `. -Then you can do:: +.. _contributing.update-dev: - git branch -d shiny-new-feature +Updating the development environment +------------------------------------ -Make sure you use a lower-case ``-d``, or else git won't warn you if your feature -branch has not actually been merged. +After updating your branch to merge in main from upstream, you may need to update +your development environment to reflect any changes to the various packages that +are used during development. -The branch will still exist on GitHub, so to delete it there do:: +If using :ref:`mamba `, do:: - git push origin --delete shiny-new-feature + mamba deactivate + mamba env update -f environment.yml + mamba activate pandas-dev -.. _Gitter: https://gitter.im/pydata/pandas +If using :ref:`pip ` , do:: + # activate the virtual environment based on your platform + pythom -m pip install --upgrade -r requirements-dev.txt Tips for a successful pull request ================================== diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst new file mode 100644 index 0000000000000..9178032c31371 --- /dev/null +++ b/doc/source/development/contributing_codebase.rst @@ -0,0 +1,1008 @@ +.. _contributing_codebase: + +{{ header }} + +============================= +Contributing to the code base +============================= + +.. contents:: Table of Contents: + :local: + +Code standards +-------------- + +Writing good code is not just about what you write. It is also about *how* you +write it. During :ref:`Continuous Integration ` testing, several +tools will be run to check your code for stylistic errors. +Generating any warnings will cause the test to fail. +Thus, good style is a requirement for submitting code to pandas. + +There is a tool in pandas to help contributors verify their changes before +contributing them to the project:: + + ./ci/code_checks.sh + +The script validates the doctests, formatting in docstrings, and +imported modules. It is possible to run the checks independently by using the +parameters ``docstrings``, ``code``, and ``doctests`` +(e.g. ``./ci/code_checks.sh doctests``). + +In addition, because a lot of people use our library, it is important that we +do not make sudden changes to the code that could have the potential to break +a lot of user code as a result, that is, we need it to be as *backwards compatible* +as possible to avoid mass breakages. + +In addition to ``./ci/code_checks.sh``, some extra checks (including static type +checking) are run by ``pre-commit`` - see :ref:`here ` +for how to run them. + +.. _contributing.pre-commit: + +Pre-commit +---------- + +Additionally, :ref:`Continuous Integration ` will run code formatting checks +like ``black``, ``ruff``, +``isort``, and ``cpplint`` and more using `pre-commit hooks `_ +Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, +it is helpful to run the check yourself before submitting code. This +can be done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the pandas repository. Now all of the styling checks will be +run each time you commit changes without your needing to run each one manually. +In addition, using ``pre-commit`` will also allow you to more easily +remain up-to-date with our code checks as they change. 
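+
+If you only want to bypass one particular hook for a single commit rather than
+all of them, ``pre-commit`` also honours a ``SKIP`` environment variable
+containing a comma-separated list of hook ids (the hook id below is just an
+example; the actual ids are defined in ``.pre-commit-config.yaml``)::
+
+    SKIP=flake8 git commit -m "your commit message"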
+ +Note that if needed, you can skip these checks with ``git commit --no-verify``. + +If you don't want to use ``pre-commit`` as part of your workflow, you can still use it +to run its checks with:: + + pre-commit run --files + +without needing to have done ``pre-commit install`` beforehand. + +If you want to run checks on all recently committed files on upstream/main you can use:: + + pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files + +without needing to have done ``pre-commit install`` beforehand. + +.. note:: + + You may want to periodically run ``pre-commit gc``, to clean up repos + which are no longer used. + +.. note:: + + If you have conflicting installations of ``virtualenv``, then you may get an + error - see `here `_. + + Also, due to a `bug in virtualenv `_, + you may run into issues if you're using conda. To solve this, you can downgrade + ``virtualenv`` to version ``20.0.33``. + +.. note:: + + If you have recently merged in main from the upstream branch, some of the + dependencies used by ``pre-commit`` may have changed. Make sure to + :ref:`update your development environment `. + +Optional dependencies +--------------------- + +Optional dependencies (e.g. matplotlib) should be imported with the private helper +``pandas.compat._optional.import_optional_dependency``. This ensures a +consistent error message when the dependency is not met. + +All methods using an optional dependency should include a test asserting that an +``ImportError`` is raised when the optional dependency is not found. This test +should be skipped if the library is present. + +All optional dependencies should be documented in +:ref:`install.optional_dependencies` and the minimum required version should be +set in the ``pandas.compat._optional.VERSIONS`` dict. + +Backwards compatibility +----------------------- + +Please try to maintain backward compatibility. pandas has lots of users with lots of +existing code, so don't break it if at all possible. If you think breakage is required, +clearly state why as part of the pull request. Also, be careful when changing method +signatures and add deprecation warnings where needed. Also, add the deprecated sphinx +directive to the deprecated functions or methods. + +If a function with the same arguments as the one being deprecated exist, you can use +the ``pandas.util._decorators.deprecate``: + +.. code-block:: python + + from pandas.util._decorators import deprecate + + deprecate('old_func', 'new_func', '1.1.0') + +Otherwise, you need to do it manually: + +.. code-block:: python + + import warnings + from pandas.util._exceptions import find_stack_level + + + def old_func(): + """Summary of the function. + + .. deprecated:: 1.1.0 + Use new_func instead. + """ + warnings.warn( + 'Use new_func instead.', + FutureWarning, + stacklevel=find_stack_level(), + ) + new_func() + + + def new_func(): + pass + +You'll also need to + +1. Write a new test that asserts a warning is issued when calling with the deprecated argument +2. Update all of pandas existing tests and code to use the new argument + +See :ref:`contributing.warnings` for more. + +.. _contributing.type_hints: + +Type hints +---------- + +pandas strongly encourages the use of :pep:`484` style type hints. New development should contain type hints and pull requests to annotate existing code are accepted as well! + +Style guidelines +~~~~~~~~~~~~~~~~ + +Type imports should follow the ``from typing import ...`` convention. 
Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than + +.. code-block:: python + + import typing + + primes: typing.List[int] = [] + +You should write + +.. code-block:: python + + primes: list[int] = [] + +``Optional`` should be avoided in favor of the shorter ``| None``, so instead of + +.. code-block:: python + + from typing import Union + + maybe_primes: list[Union[int, None]] = [] + +or + +.. code-block:: python + + from typing import Optional + + maybe_primes: list[Optional[int]] = [] + +You should write + +.. code-block:: python + + from __future__ import annotations # noqa: F404 + + maybe_primes: list[int | None] = [] + +In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like + +.. code-block:: python + + class SomeClass1: + str = None + +The appropriate way to annotate this would be as follows + +.. code-block:: python + + str_type = str + + class SomeClass2: + str: str_type = None + +In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +.. code-block:: python + + from typing import cast + + from pandas.core.dtypes.common import is_number + + def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() + +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable + +.. code-block:: python + + def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... + +With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. + +pandas-specific types +~~~~~~~~~~~~~~~~~~~~~ + +Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +.. code-block:: python + + from pandas._typing import Dtype + + def as_type(dtype: Dtype) -> ...: + ... + +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like ``axis``. 
Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating type hints +~~~~~~~~~~~~~~~~~~~~~ + +pandas uses `mypy `_ and `pyright `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are consistent by running + +.. code-block:: shell + + pre-commit run --hook-stage manual --all-files mypy + pre-commit run --hook-stage manual --all-files pyright + pre-commit run --hook-stage manual --all-files pyright_reportGeneralTypeIssues + # the following might fail if the installed pandas version does not correspond to your local git version + pre-commit run --hook-stage manual --all-files stubtest + +in your python environment. + +.. warning:: + + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + +.. _contributing.ci: + +Testing type hints in code using pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + * Pandas is not yet a py.typed library (:pep:`561`)! + The primary purpose of locally declaring pandas as a py.typed library is to test and + improve the pandas-builtin type annotations. + +Until pandas becomes a py.typed library, it is possible to easily experiment with the type +annotations shipped with pandas by creating an empty file named "py.typed" in the pandas +installation folder: + +.. code-block:: none + + python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" + +The existence of the py.typed file signals to type checkers that pandas is already a py.typed +library. This makes type checkers aware of the type annotations shipped with pandas. + +Testing with continuous integration +----------------------------------- + +The pandas test suite will run automatically on `GitHub Actions `__ +continuous integration services, once your pull request is submitted. +However, if you wish to run the test suite on a branch prior to submitting the pull request, +then the continuous integration services need to be hooked to your GitHub repository. Instructions are here +for `GitHub Actions `__. + +A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, +then you will get a red 'X', where you can click through to see the individual failed tests. +This is an example of a green build. + +.. image:: ../_static/ci.png + +.. _contributing.tdd: + + +Test-driven development +----------------------- + +pandas is serious about testing and strongly encourages contributors to embrace +`test-driven development (TDD) `_. +This development process "relies on the repetition of a very short development cycle: +first the developer writes an (initially failing) automated test case that defines a desired +improvement or new function, then produces the minimum amount of code to pass that test." +So, before actually writing any code, you should write your tests. Often the test can be +taken from the original GitHub issue. 
However, it is always worth considering additional +use cases and writing corresponding tests. + +Adding tests is one of the most common requests after code is pushed to pandas. Therefore, +it is worth getting in the habit of writing tests ahead of time so this is never an issue. + +Writing tests +~~~~~~~~~~~~~ + +All tests should go into the ``tests`` subdirectory of the specific package. +This folder contains many current examples of tests, and we suggest looking to these for +inspiration. + +As a general tip, you can use the search functionality in your integrated development +environment (IDE) or the git grep command in a terminal to find test files in which the method +is called. If you are unsure of the best location to put your test, take your best guess, +but note that reviewers may request that you move the test to a different location. + +To use git grep, you can run the following command in a terminal: + +``git grep "function_name("`` + +This will search through all files in your repository for the text ``function_name(``. +This can be a useful way to quickly locate the function in the +codebase and determine the best location to add a test for it. + +Ideally, there should be one, and only one, obvious place for a test to reside. +Until we reach that ideal, these are some rules of thumb for where a test should +be located. + +1. Does your test depend only on code in ``pd._libs.tslibs``? + This test likely belongs in one of: + + - tests.tslibs + + .. note:: + + No file in ``tests.tslibs`` should import from any pandas modules + outside of ``pd._libs.tslibs`` + + - tests.scalar + - tests.tseries.offsets + +2. Does your test depend only on code in pd._libs? + This test likely belongs in one of: + + - tests.libs + - tests.groupby.test_libgroupby + +3. Is your test for an arithmetic or comparison method? + This test likely belongs in one of: + + - tests.arithmetic + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray using the ``box_with_array`` + fixture. + + - tests.frame.test_arithmetic + - tests.series.test_arithmetic + +4. Is your test for a reduction method (min, max, sum, prod, ...)? + This test likely belongs in one of: + + - tests.reductions + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray. + + - tests.frame.test_reductions + - tests.series.test_reductions + - tests.test_nanops + +5. Is your test for an indexing method? + This is the most difficult case for deciding where a test belongs, because + there are many of these tests, and many of them test more than one method + (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``) + + A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``, + ``Index.get_indexer``)? + This test likely belongs in one of: + + - tests.indexes.test_indexing + - tests.indexes.fooindex.test_indexing + + Within that files there should be a method-specific test class e.g. + ``TestGetLoc``. + + In most cases, neither ``Series`` nor ``DataFrame`` objects should be + needed in these tests. + + B) Is the test for a Series or DataFrame indexing method *other* than + ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``, + ``mask``, ``lookup``, or ``insert``? + This test likely belongs in one of: + + - tests.frame.indexing.test_methodname + - tests.series.indexing.test_methodname + + C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``? 
+ This test likely belongs in one of: + + - tests.indexing.test_loc + - tests.indexing.test_iloc + - tests.indexing.test_at + - tests.indexing.test_iat + + Within the appropriate file, test classes correspond to either types of + indexers (e.g. ``TestLocBooleanMask``) or major use cases + (e.g. ``TestLocSetitemWithExpansion``). + + See the note in section D) about tests that test multiple indexing methods. + + D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``, + ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``? + This test likely belongs in one of: + + - tests.series.test_getitem + - tests.series.test_setitem + - tests.frame.test_getitem + - tests.frame.test_setitem + + If many cases such a test may test multiple similar methods, e.g. + + .. code-block:: python + + import pandas as pd + import pandas._testing as tm + + def test_getitem_listlike_of_ints(): + ser = pd.Series(range(5)) + + result = ser[[3, 4]] + expected = pd.Series([2, 3]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[3, 4]] + tm.assert_series_equal(result, expected) + + In cases like this, the test location should be based on the *underlying* + method being tested. Or in the case of a test for a bugfix, the location + of the actual bug. So in this example, we know that ``Series.__getitem__`` + calls ``Series.loc.__getitem__``, so this is *really* a test for + ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``. + +6. Is your test for a DataFrame or Series method? + + A) Is the method a plotting method? + This test likely belongs in one of: + + - tests.plotting + + B) Is the method an IO method? + This test likely belongs in one of: + + - tests.io + + C) Otherwise + This test likely belongs in one of: + + - tests.series.methods.test_mymethod + - tests.frame.methods.test_mymethod + + .. note:: + + If a test can be shared between DataFrame/Series using the + ``frame_or_series`` fixture, by convention it goes in the + ``tests.frame`` file. + +7. Is your test for an Index method, not depending on Series/DataFrame? + This test likely belongs in one of: + + - tests.indexes + +8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, + ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, + ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? + This test likely belongs in one of: + + - tests.arrays + +9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")? + This test likely belongs in one of: + + - tests.extension + +Using ``pytest`` +~~~~~~~~~~~~~~~~ + +Test structure +^^^^^^^^^^^^^^ + +pandas existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. + +.. code-block:: python + + class TestReallyCoolFeature: + def test_cool_feature_aspect(self): + pass + +We prefer a more *functional* style using the `pytest `__ framework, which offers a richer testing +framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: + +.. code-block:: python + + def test_really_cool_feature(): + pass + +Preferred ``pytest`` idioms +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Functional tests named ``def test_*`` and *only* take arguments that are either fixtures or parameters. 
+* Use a bare ``assert`` for testing scalars and truth-testing.
+* Use ``tm.assert_series_equal(result, expected)`` and ``tm.assert_frame_equal(result, expected)`` for comparing :class:`Series` and :class:`DataFrame` results respectively.
+* Use `@pytest.mark.parametrize `__ when testing multiple cases.
+* Use `pytest.mark.xfail `__ when a test case is expected to fail.
+* Use `pytest.mark.skip `__ when a test case is never expected to pass.
+* Use `pytest.param `__ when a test case needs a particular mark.
+* Use `@pytest.fixture `__ if multiple tests can share a setup object.
+
+.. warning::
+
+    Do not use ``pytest.xfail`` (which is different than ``pytest.mark.xfail``) since it immediately stops the
+    test and does not check if the test will fail. If this is the behavior you desire, use ``pytest.skip`` instead.
+
+If a test is known to fail but the manner in which it fails
+is not meant to be captured, use ``pytest.mark.xfail``. It is common to use this method for a test that
+exhibits buggy behavior or a non-implemented feature. If
+the failing test has flaky behavior, use the argument ``strict=False``. This
+will make it so pytest does not fail if the test happens to pass.
+
+Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param``
+over usage within a test so that the test is appropriately marked during the
+collection phase of pytest. For xfailing a test that involves multiple
+parameters, a fixture, or a combination of these, it is only possible to
+xfail during the testing phase. To do so, use the ``request`` fixture:
+
+.. code-block:: python
+
+    def test_xfail(request):
+        mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here")
+        request.node.add_marker(mark)
+
+xfail is not to be used for tests involving failure due to invalid user arguments.
+For these tests, we need to verify the correct exception type and error message
+is being raised, using ``pytest.raises`` instead.
+
+.. _contributing.warnings:
+
+Testing a warning
+^^^^^^^^^^^^^^^^^
+
+Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning.
+
+.. code-block:: python
+
+    with tm.assert_produces_warning(DeprecationWarning):
+        pd.deprecated_function()
+
+If a warning should specifically not happen in a block of code, pass ``False`` into the context manager.
+
+.. code-block:: python
+
+    with tm.assert_produces_warning(False):
+        pd.no_warning_function()
+
+If you have a test that would emit a warning, but you aren't actually testing the
+warning itself (say because it's going to be removed in the future, or because we're
+matching a 3rd-party library's behavior), then use ``pytest.mark.filterwarnings`` to
+ignore the error.
+
+.. code-block:: python
+
+    @pytest.mark.filterwarnings("ignore:msg:category")
+    def test_thing(self):
+        pass
+
+If you need finer-grained control, you can use Python's
+`warnings module `__
+to control whether a warning is ignored or raised at different places within
+a single test.
+
+.. code-block:: python
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", FutureWarning)
+
+Testing an exception
+^^^^^^^^^^^^^^^^^^^^
+
+Use `pytest.raises `_ as a context manager
+with the specific exception subclass (i.e. never use :py:class:`Exception`) and the exception message in ``match``.
+
+.. 
+Testing an exception
+^^^^^^^^^^^^^^^^^^^^
+
+Use `pytest.raises `_ as a context manager
+with the specific exception subclass (i.e. never use :py:class:`Exception`) and the exception message in ``match``.
+
+.. code-block:: python
+
+   with pytest.raises(ValueError, match="an error"):
+       raise ValueError("an error")
+
+Testing involving files
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``tm.ensure_clean`` context manager creates a temporary file for testing,
+with a generated filename (or your filename if provided), that is automatically
+deleted when the context block is exited.
+
+.. code-block:: python
+
+   with tm.ensure_clean('my_file_path') as path:
+       ...  # do something with the path
+
+Testing involving network connectivity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and
+lack of ownership of the server that is being connected to. If network connectivity is absolutely required, use the
+``tm.network`` decorator.
+
+.. code-block:: python
+
+   @tm.network # noqa
+   def test_network():
+       result = package.call_to_internet()
+
+If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator.
+
+.. code-block:: python
+
+   @tm.network("https://www.somespecificsite.com/", check_before_test=True)
+   def test_network():
+       result = pd.read_html("https://www.somespecificsite.com/")
+
+Example
+^^^^^^^
+
+Here is an example of a self-contained set of tests in a file ``pandas/tests/test_cool_feature.py``
+that illustrate multiple features that we like to use. Please remember to add the GitHub Issue Number
+as a comment to a new test.
+
+.. code-block:: python
+
+   import pytest
+   import numpy as np
+   import pandas as pd
+   import pandas._testing as tm
+
+
+   @pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64'])
+   def test_dtypes(dtype):
+       assert str(np.dtype(dtype)) == dtype
+
+
+   @pytest.mark.parametrize(
+       'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip),
+                 pytest.param('int32', marks=pytest.mark.xfail(
+                     reason='to show how it works'))])
+   def test_mark(dtype):
+       assert str(np.dtype(dtype)) == 'float32'
+
+
+   @pytest.fixture
+   def series():
+       return pd.Series([1, 2, 3])
+
+
+   @pytest.fixture(params=['int8', 'int16', 'int32', 'int64'])
+   def dtype(request):
+       return request.param
+
+
+   def test_series(series, dtype):
+       # GH
+       result = series.astype(dtype)
+       assert result.dtype == dtype
+
+       expected = pd.Series([1, 2, 3], dtype=dtype)
+       tm.assert_series_equal(result, expected)
+
+
+A test run of this yields
+
+.. code-block:: shell
+
+   (pandas) bash-3.2$ pytest test_cool_feature.py -v
+   =========================== test session starts ===========================
+   platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0
+   collected 11 items
+
+   test_cool_feature.py::test_dtypes[int8] PASSED
+   test_cool_feature.py::test_dtypes[int16] PASSED
+   test_cool_feature.py::test_dtypes[int32] PASSED
+   test_cool_feature.py::test_dtypes[int64] PASSED
+   test_cool_feature.py::test_mark[float32] PASSED
+   test_cool_feature.py::test_mark[int16] SKIPPED
+   test_cool_feature.py::test_mark[int32] xfail
+   test_cool_feature.py::test_series[int8] PASSED
+   test_cool_feature.py::test_series[int16] PASSED
+   test_cool_feature.py::test_series[int32] PASSED
+   test_cool_feature.py::test_series[int64] PASSED
+
+Tests that we have ``parametrized`` are now accessible via the test name, for example we could run these with ``-k int8`` to sub-select *only* those tests which match ``int8``.
+
+
+.. 
code-block:: shell + + ((pandas) bash-3.2$ pytest test_cool_feature.py -v -k int8 + =========================== test session starts =========================== + platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 + collected 11 items + + test_cool_feature.py::test_dtypes[int8] PASSED + test_cool_feature.py::test_series[int8] PASSED + + +.. _using-hypothesis: + +Using ``hypothesis`` +~~~~~~~~~~~~~~~~~~~~ + +Hypothesis is a library for property-based testing. Instead of explicitly +parametrizing a test, you can describe *all* valid inputs and let Hypothesis +try to find a failing input. Even better, no matter how many random examples +it tries, Hypothesis always reports a single minimal counterexample to your +assertions - often an example that you would never have thought to test. + +See `Getting Started with Hypothesis `_ +for more of an introduction, then `refer to the Hypothesis documentation +for details `_. + +.. code-block:: python + + import json + from hypothesis import given, strategies as st + + any_json_value = st.deferred(lambda: st.one_of( + st.none(), st.booleans(), st.floats(allow_nan=False), st.text(), + st.lists(any_json_value), st.dictionaries(st.text(), any_json_value) + )) + + + @given(value=any_json_value) + def test_json_roundtrip(value): + result = json.loads(json.dumps(value)) + assert value == result + +This test shows off several useful features of Hypothesis, as well as +demonstrating a good use-case: checking properties that should hold over +a large or complicated domain of inputs. + +To keep the pandas test suite running quickly, parametrized tests are +preferred if the inputs or logic are simple, with Hypothesis tests reserved +for cases with complex logic or where there are too many combinations of +options or subtle interactions to test (or think of!) all of them. + +.. _contributing.running_tests: + +Running the test suite +---------------------- + +The tests can then be run directly inside your Git clone (without having to +install pandas) by typing:: + + pytest pandas + +.. note:: + + If a handful of tests don't pass, it may not be an issue with your pandas installation. + Some tests (e.g. some SQLAlchemy ones) require additional setup, others might start + failing because a non-pinned library released a new version, and others might be flaky + if run in parallel. As long as you can import pandas from your locally built version, + your installation is probably fine and you can start contributing! + +Often it is worth running only a subset of tests first around your changes before running the +entire suite. + +The easiest way to do this is with:: + + pytest pandas/path/to/test.py -k regex_matching_test_name + +Or with one of the following constructs:: + + pytest pandas/tests/[test-module].py + pytest pandas/tests/[test-module].py::[TestClass] + pytest pandas/tests/[test-module].py::[TestClass]::[test_method] + +Using `pytest-xdist `_, which is +included in our 'pandas-dev' environment, one can speed up local testing on +multicore machines. The ``-n`` number flag then can be specified when running +pytest to parallelize a test run across the number of specified cores or auto to +utilize all the available cores on your machine. + +.. code-block:: bash + + # Utilize 4 cores + pytest -n 4 pandas + + # Utilizes all available cores + pytest -n auto pandas + +If you'd like to speed things along further a more advanced use of this +command would look like this + +.. 
code-block:: bash + + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX + +In addition to the multithreaded performance increase this improves test +speed by skipping some tests using the ``-m`` mark flag: + +- slow: any test taking long (think seconds rather than milliseconds) +- network: tests requiring network connectivity +- db: tests requiring a database (mysql or postgres) +- single_cpu: tests that should run on a single cpu only + +You might want to enable the following option if it's relevant for you: + +- arm_slow: any test taking long on arm64 architecture + +These markers are defined `in this toml file `_ +, under ``[tool.pytest.ini_options]`` in a list called ``markers``, in case +you want to check if new ones have been created which are of interest to you. + +The ``-r`` report flag will display a short summary info (see `pytest +documentation `_) +. Here we are displaying the number of: + +- s: skipped tests +- x: xfailed tests +- X: xpassed tests + +The summary is optional and can be removed if you don't need the added +information. Using the parallelization option can significantly reduce the +time it takes to locally run tests before submitting a pull request. + +If you require assistance with the results, +which has happened in the past, please set a seed before running the command +and opening a bug report, that way we can reproduce it. Here's an example +for setting a seed on windows + +.. code-block:: bash + + set PYTHONHASHSEED=314159265 + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX + +On Unix use + +.. code-block:: bash + + export PYTHONHASHSEED=314159265 + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX + +For more, see the `pytest `_ documentation. + +Furthermore one can run + +.. code-block:: python + + pd.test() + +with an imported pandas to run tests similarly. + +Running the performance test suite +---------------------------------- + +Performance matters and it is worth considering whether your code has introduced +performance regressions. pandas is in the process of migrating to +`asv benchmarks `__ +to enable easy monitoring of the performance of critical pandas operations. +These benchmarks are all found in the ``pandas/asv_bench`` directory, and the +test results can be found `here `__. + +To use all features of asv, you will need either ``conda`` or +``virtualenv``. For more details please check the `asv installation +webpage `_. + +To install asv:: + + pip install git+https://github.com/airspeed-velocity/asv + +If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: + + asv continuous -f 1.1 upstream/main HEAD + +You can replace ``HEAD`` with the name of the branch you are working on, +and report benchmarks that changed by more than 10%. +The command uses ``conda`` by default for creating the benchmark +environments. If you want to use virtualenv instead, write:: + + asv continuous -f 1.1 -E virtualenv upstream/main HEAD + +The ``-E virtualenv`` option should be added to all ``asv`` commands +that run benchmarks. The default value is defined in ``asv.conf.json``. + +Running the full benchmark suite can be an all-day process, depending on your +hardware and its resource utilization. However, usually it is sufficient to paste +only a subset of the results into the pull request to show that the committed changes +do not cause unexpected performance regressions. 
You can run specific benchmarks +using the ``-b`` flag, which takes a regular expression. For example, this will +only run benchmarks from a ``pandas/asv_bench/benchmarks/groupby.py`` file:: + + asv continuous -f 1.1 upstream/main HEAD -b ^groupby + +If you want to only run a specific group of benchmarks from a file, you can do it +using ``.`` as a separator. For example:: + + asv continuous -f 1.1 upstream/main HEAD -b groupby.GroupByMethods + +will only run the ``GroupByMethods`` benchmark defined in ``groupby.py``. + +You can also run the benchmark suite using the version of ``pandas`` +already installed in your current Python environment. This can be +useful if you do not have virtualenv or conda, or are using the +``setup.py develop`` approach discussed above; for the in-place build +you need to set ``PYTHONPATH``, e.g. +``PYTHONPATH="$PWD/.." asv [remaining arguments]``. +You can run benchmarks using an existing Python +environment by:: + + asv run -e -E existing + +or, to use a specific Python interpreter,:: + + asv run -e -E existing:python3.6 + +This will display stderr from the benchmarks, and use your local +``python`` that comes from your ``$PATH``. + +Information on how to write a benchmark and how to use asv can be found in the +`asv documentation `_. + +Documenting your code +--------------------- + +Changes should be reflected in the release notes located in ``doc/source/whatsnew/vx.y.z.rst``. +This file contains an ongoing change log for each release. Add an entry to this file to +document your fix, enhancement or (unavoidable) breaking change. Make sure to include the +GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the +issue/pull request number). Your entry should be written using full sentences and proper +grammar. + +When mentioning parts of the API, use a Sphinx ``:func:``, ``:meth:``, or ``:class:`` +directive as appropriate. Not all public API functions and methods have a +documentation page; ideally links would only be added if they resolve. You can +usually find similar examples by checking the release notes for one of the previous +versions. + +If your code is a bugfix, add your entry to the relevant bugfix section. Avoid +adding to the ``Other`` section; only in rare cases should entries go there. +Being as concise as possible, the description of the bug should include how the +user may encounter it and an indication of the bug itself, e.g. +"produces incorrect results" or "incorrectly raises". It may be necessary to also +indicate the new behavior. + +If your code is an enhancement, it is most likely necessary to add usage +examples to the existing documentation. This can be done following the section +regarding :ref:`documentation `. +Further, to let users know when this feature was added, the ``versionadded`` +directive is used. The sphinx syntax for that is: + +.. code-block:: rst + + .. versionadded:: 1.1.0 + +This will put the text *New in version 1.1.0* wherever you put the sphinx +directive. This should also be put in the docstring when adding a new function +or method (`example `__) +or a new keyword argument (`example `__). diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 0c780ad5f5847..6524e4da2299d 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -25,25 +25,25 @@ The next example gives an idea of what a docstring looks like: """ Add up two integer numbers. 
- This function simply wraps the `+` operator, and does not + This function simply wraps the ``+`` operator, and does not do anything interesting, except for illustrating what the docstring of a very simple function looks like. Parameters ---------- num1 : int - First number to add + First number to add. num2 : int - Second number to add + Second number to add. Returns ------- int - The sum of `num1` and `num2` + The sum of ``num1`` and ``num2``. See Also -------- - subtract : Subtract one integer from another + subtract : Subtract one integer from another. Examples -------- @@ -63,14 +63,12 @@ The first conventions every Python docstring should follow are defined in `PEP-257 `_. As PEP-257 is quite broad, other more specific standards also exist. In the -case of pandas, the numpy docstring convention is followed. These conventions are +case of pandas, the NumPy docstring convention is followed. These conventions are explained in this document: * `numpydoc docstring guide `_ - (which is based in the original `Guide to NumPy/SciPy documentation - `_) -numpydoc is a Sphinx extension to support the numpy docstring convention. +numpydoc is a Sphinx extension to support the NumPy docstring convention. The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation @@ -126,9 +124,9 @@ backticks. The following are considered inline code: def add_values(arr): """ - Add the values in `arr`. + Add the values in ``arr``. - This is equivalent to Python `sum` of :meth:`pandas.Series.sum`. + This is equivalent to Python ``sum`` of :meth:`pandas.Series.sum`. Some sections are omitted here for simplicity. """ @@ -144,13 +142,13 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature `def func():`. + It has a blank like after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. There is a blank line between the docstring and the first line - of code `foo = 1`. + of code ``foo = 1``. The closing quotes should be in the next line, not in this one.""" @@ -269,11 +267,11 @@ after, and not between the line with the word "Parameters" and the one with the hyphens. After the title, each parameter in the signature must be documented, including -`*args` and `**kwargs`, but not `self`. +``*args`` and ``**kwargs``, but not ``self``. The parameters are defined by their name, followed by a space, a colon, another space, and the type (or types). Note that the space between the name and the -colon is important. Types are not defined for `*args` and `**kwargs`, but must +colon is important. Types are not defined for ``*args`` and ``**kwargs``, but must be defined for all other parameters. After the parameter definition, it is required to have a line with the parameter description, which is indented, and can have multiple lines. The description must start with a capital letter, and @@ -285,13 +283,13 @@ comma at the end of the type. The exact form of the type in this case will be argument means, which can be added after a comma "int, default -1, meaning all cpus". -In cases where the default value is `None`, meaning that the value will not be -used. Instead of "str, default None", it is preferred to write "str, optional". -When `None` is a value being used, we will keep the form "str, default None". 
-For example, in `df.to_csv(compression=None)`, `None` is not a value being used, +In cases where the default value is ``None``, meaning that the value will not be +used. Instead of ``"str, default None"``, it is preferred to write ``"str, optional"``. +When ``None`` is a value being used, we will keep the form "str, default None". +For example, in ``df.to_csv(compression=None)``, ``None`` is not a value being used, but means that compression is optional, and no compression is being used if not -provided. In this case we will use `str, optional`. Only in cases like -`func(value=None)` and `None` is being used in the same way as `0` or `foo` +provided. In this case we will use ``"str, optional"``. Only in cases like +``func(value=None)`` and ``None`` is being used in the same way as ``0`` or ``foo`` would be used, then we will specify "str, int or None, default None". **Good:** @@ -331,13 +329,13 @@ would be used, then we will specify "str, int or None, default None". specified kind. Note the blank line between the parameters title and the first - parameter. Also, note that after the name of the parameter `kind` + parameter. Also, note that after the name of the parameter ``kind`` and before the colon, a space is missing. Also, note that the parameter descriptions do not start with a capital letter, and do not finish with a dot. - Finally, the `**kwargs` parameter is missing. + Finally, the ``**kwargs`` parameter is missing. Parameters ---------- @@ -361,9 +359,9 @@ boolean, etc): * str * bool -For complex types, define the subtypes. For `dict` and `tuple`, as more than +For complex types, define the subtypes. For ``dict`` and ``tuple``, as more than one type is present, we use the brackets to help read the type (curly brackets -for `dict` and normal brackets for `tuple`): +for ``dict`` and normal brackets for ``tuple``): * list of int * dict of {str : int} @@ -401,7 +399,7 @@ DataFrame: * pandas.Categorical * pandas.arrays.SparseArray -If the exact type is not relevant, but must be compatible with a numpy +If the exact type is not relevant, but must be compatible with a NumPy array, array-like can be specified. If Any type that can be iterated is accepted, iterable can be used: @@ -512,8 +510,8 @@ This section is used to let users know about pandas functionality related to the one being documented. In rare cases, if no related methods or functions can be found at all, this section can be skipped. -An obvious example would be the `head()` and `tail()` methods. As `tail()` does -the equivalent as `head()` but at the end of the `Series` or `DataFrame` +An obvious example would be the ``head()`` and ``tail()`` methods. As ``tail()`` does +the equivalent as ``head()`` but at the end of the ``Series`` or ``DataFrame`` instead of at the beginning, it is good to let the users know about it. To give an intuition on what can be considered related, here there are some @@ -608,8 +606,8 @@ Examples in docstrings, besides illustrating the usage of the function or method, must be valid Python code, that returns the given output in a deterministic way, and that can be copied and run by users. -Examples are presented as a session in the Python terminal. `>>>` is used to -present code. `...` is used for code continuing from the previous line. +Examples are presented as a session in the Python terminal. ``>>>`` is used to +present code. ``...`` is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). 
Comments describing the examples can be added with blank lines before and after them. @@ -664,7 +662,7 @@ A simple example could be: 4 Falcon dtype: object - With the `n` parameter, we can change the number of returned rows: + With the ``n`` parameter, we can change the number of returned rows: >>> s.head(n=3) 0 Ant @@ -742,7 +740,7 @@ positional arguments ``head(3)``. def fillna(self, value): """ - Replace missing values by `value`. + Replace missing values by ``value``. Examples -------- @@ -771,7 +769,7 @@ positional arguments ``head(3)``. def contains(self, pattern, case_sensitive=True, na=numpy.nan): """ - Return whether each value contains `pattern`. + Return whether each value contains ``pattern``. In this case, we are illustrating how to use sections, even if the example is simple enough and does not require them. @@ -788,8 +786,8 @@ positional arguments ``head(3)``. **Case sensitivity** - With `case_sensitive` set to `False` we can match `a` with both - `a` and `A`: + With ``case_sensitive`` set to ``False`` we can match ``a`` with both + ``a`` and ``A``: >>> s.contains(pattern='a', case_sensitive=False) 0 True @@ -800,7 +798,7 @@ positional arguments ``head(3)``. **Missing values** - We can fill missing values in the output using the `na` parameter: + We can fill missing values in the output using the ``na`` parameter: >>> s.contains(pattern='a', na=False) 0 False @@ -819,14 +817,14 @@ positional arguments ``head(3)``. """ A sample DataFrame method. - Do not import numpy and pandas. + Do not import NumPy and pandas. Try to use meaningful data, when it makes the example easier to understand. - Try to avoid positional arguments like in `df.method(1)`. They + Try to avoid positional arguments like in ``df.method(1)``. They can be all right if previously defined with a meaningful name, - like in `present_value(interest_rate)`, but avoid them otherwise. + like in ``present_value(interest_rate)``, but avoid them otherwise. When presenting the behavior with different parameters, do not place all the calls one next to the other. Instead, add a short sentence @@ -854,7 +852,7 @@ Tips for getting your examples pass the doctests Getting the examples pass the doctests in the validation script can sometimes be tricky. Here are some attention points: -* Import all needed libraries (except for pandas and numpy, those are already +* Import all needed libraries (except for pandas and NumPy, those are already imported as ``import pandas as pd`` and ``import numpy as np``) and define all variables you use in the example. @@ -914,7 +912,7 @@ plot will be generated automatically when building the documentation. class Series: def plot(self): """ - Generate a plot with the `Series` data. + Generate a plot with the ``Series`` data. Examples -------- @@ -998,4 +996,4 @@ mapping function names to docstrings. Wherever possible, we prefer using See ``pandas.core.generic.NDFrame.fillna`` for an example template, and ``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` -for the filled versions. \ No newline at end of file +for the filled versions. diff --git a/doc/source/development/contributing_documentation.rst b/doc/source/development/contributing_documentation.rst new file mode 100644 index 0000000000000..f064c5eb07a9a --- /dev/null +++ b/doc/source/development/contributing_documentation.rst @@ -0,0 +1,217 @@ +.. 
_contributing_documentation: + +{{ header }} + +================================= +Contributing to the documentation +================================= + +Contributing to the documentation benefits everyone who uses pandas. +We encourage you to help us improve the documentation, and +you don't have to be an expert on pandas to do so! In fact, +there are sections of the docs that are worse off after being written by +experts. If something in the docs doesn't make sense to you, updating the +relevant section after you figure it out is a great way to ensure it will help +the next person. Please visit the `issues page `__ +for a full list of issues that are currently open regarding the +Pandas documentation. + + + +.. contents:: Documentation: + :local: + + +About the pandas documentation +-------------------------------- + +The documentation is written in **reStructuredText**, which is almost like writing +in plain English, and built using `Sphinx `__. The +Sphinx Documentation has an excellent `introduction to reST +`__. Review the Sphinx docs to perform more +complex changes to the documentation as well. + +Some other important things to know about the docs: + +* The pandas documentation consists of two parts: the docstrings in the code + itself and the docs in this folder ``doc/``. + + The docstrings provide a clear explanation of the usage of the individual + functions, while the documentation in this folder consists of tutorial-like + overviews per topic together with some other information (what's new, + installation, etc). + +* The docstrings follow a pandas convention, based on the **Numpy Docstring + Standard**. Follow the :ref:`pandas docstring guide ` for detailed + instructions on how to write a correct docstring. + + .. toctree:: + :maxdepth: 2 + + contributing_docstring.rst + +* The tutorials make heavy use of the `IPython directive + `_ sphinx extension. + This directive lets you put code in the documentation which will be run + during the doc build. For example:: + + .. ipython:: python + + x = 2 + x**3 + + will be rendered as:: + + In [1]: x = 2 + + In [2]: x**3 + Out[2]: 8 + + Almost all code examples in the docs are run (and the output saved) during the + doc build. This approach means that code examples will always be up to date, + but it does make the doc building a bit more complex. + +* Our API documentation files in ``doc/source/reference`` house the auto-generated + documentation from the docstrings. For classes, there are a few subtleties + around controlling which methods and attributes have pages auto-generated. + + We have two autosummary templates for classes. + + 1. ``_templates/autosummary/class.rst``. Use this when you want to + automatically generate a page for every public method and attribute on the + class. The ``Attributes`` and ``Methods`` sections will be automatically + added to the class' rendered documentation by numpydoc. See ``DataFrame`` + for an example. + + 2. ``_templates/autosummary/class_without_autosummary``. Use this when you + want to pick a subset of methods / attributes to auto-generate pages for. + When using this template, you should include an ``Attributes`` and + ``Methods`` section in the class docstring. See ``CategoricalIndex`` for an + example. + + Every method should be included in a ``toctree`` in one of the documentation files in + ``doc/source/reference``, else Sphinx + will emit a warning. + +The utility script ``scripts/validate_docstrings.py`` can be used to get a csv +summary of the API documentation. 
And also validate common errors in the docstring +of a specific class, function or method. The summary also compares the list of +methods documented in the files in ``doc/source/reference`` (which is used to generate +the `API Reference `_ page) +and the actual public methods. +This will identify methods documented in ``doc/source/reference`` that are not actually +class methods, and existing methods that are not documented in ``doc/source/reference``. + + +Updating a pandas docstring +----------------------------- + +When improving a single function or method's docstring, it is not necessarily +needed to build the full documentation (see next section). +However, there is a script that checks a docstring (for example for the ``DataFrame.mean`` method):: + + python scripts/validate_docstrings.py pandas.DataFrame.mean + +This script will indicate some formatting errors if present, and will also +run and test the examples included in the docstring. +Check the :ref:`pandas docstring guide ` for a detailed guide +on how to format the docstring. + +The examples in the docstring ('doctests') must be valid Python code, +that in a deterministic way returns the presented output, and that can be +copied and run by users. This can be checked with the script above, and is +also tested on Travis. A failing doctest will be a blocker for merging a PR. +Check the :ref:`examples ` section in the docstring guide +for some tips and tricks to get the doctests passing. + +When doing a PR with a docstring update, it is good to post the +output of the validation script in a comment on github. + +.. _contributing.howto-build-docs: + +How to build the pandas documentation +--------------------------------------- + +Requirements +~~~~~~~~~~~~ + +First, you need to have a development environment to be able to build pandas +(see the docs on :ref:`creating a development environment `). + +Building the documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +So how do you build the docs? Navigate to your local +``doc/`` directory in the console and run:: + + python make.py html + +Then you can find the HTML output in the folder ``doc/build/html/``. + +The first time you build the docs, it will take quite a while because it has to run +all the code examples and build all the generated docstring pages. In subsequent +evocations, sphinx will try to only build the pages that have been modified. + +If you want to do a full clean build, do:: + + python make.py clean + python make.py html + +You can tell ``make.py`` to compile only a single section of the docs, greatly +reducing the turn-around time for checking your changes. + +:: + + # omit autosummary and API section + python make.py clean + python make.py --no-api + + # compile the docs with only a single section, relative to the "source" folder. + # For example, compiling only this guide (doc/source/development/contributing.rst) + python make.py clean + python make.py --single development/contributing.rst + + # compile the reference docs for a single function + python make.py clean + python make.py --single pandas.DataFrame.join + + # compile whatsnew and API section (to resolve links in the whatsnew) + python make.py clean + python make.py --whatsnew + +For comparison, a full documentation build may take 15 minutes, but a single +section may take 15 seconds. Subsequent builds, which only process portions +you have changed, will be faster. + +The build will automatically use the number of cores available on your machine +to speed up the documentation build. 
You can override this:: + + python make.py html --num-jobs 4 + +Open the following file in a web browser to see the full documentation you +just built:: + + doc/build/html/index.html + +And you'll have the satisfaction of seeing your new and improved documentation! + +.. _contributing.dev_docs: + +Building main branch documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When pull requests are merged into the pandas ``main`` branch, the main parts of +the documentation are also built by Travis-CI. These docs are then hosted `here +`__, see also +the :any:`Continuous Integration ` section. + +Previewing changes +------------------ + +Once, the pull request is submitted, GitHub Actions will automatically build the +documentation. To view the built site: + +#. Wait for the ``CI / Web and docs`` check to complete. +#. Click ``Details`` next to it. +#. From the ``Artifacts`` drop-down, click ``docs`` or ``website`` to download + the site as a ZIP file. diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst new file mode 100644 index 0000000000000..a648b18a554ee --- /dev/null +++ b/doc/source/development/contributing_environment.rst @@ -0,0 +1,228 @@ +.. _contributing_environment: + +{{ header }} + +================================== +Creating a development environment +================================== + +To test out code changes, you'll need to build pandas from source, which +requires a C/C++ compiler and Python environment. If you're making documentation +changes, you can skip to :ref:`contributing to the documentation ` but if you skip +creating the development environment you won't be able to build the documentation +locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks `. + +.. toctree:: + :maxdepth: 2 + :hidden: + + contributing_gitpod.rst + +Step 1: install a C compiler +---------------------------- + +How to do this will depend on your platform. If you choose to user ``Docker`` +in the next step, then you can skip this step. + +**Windows** + +You will need `Build Tools for Visual Studio 2022 +`_. + +.. note:: + You DO NOT need to install Visual Studio 2022. + You only need "Build Tools for Visual Studio 2022" found by + scrolling down to "All downloads" -> "Tools for Visual Studio". + In the installer, select the "Desktop development with C++" Workloads. + +Alternatively, you can install the necessary components on the commandline using +`vs_BuildTools.exe `_ + +Alternatively, you could use the `WSL `_ +and consult the ``Linux`` instructions below. + +**macOS** + +To use the :ref:`mamba `-based compilers, you will need to install the +Developer Tools using ``xcode-select --install``. Otherwise +information about compiler installation can be found here: +https://devguide.python.org/setup/#macos + +**Linux** + +For Linux-based :ref:`mamba ` installations, you won't have to install any +additional components outside of the mamba environment. The instructions +below are only needed if your setup isn't based on mamba environments. + +Some Linux distributions will come with a pre-installed C compiler. To find out +which compilers (and versions) are installed on your system:: + + # for Debian/Ubuntu: + dpkg --list | grep compiler + # for Red Hat/RHEL/CentOS/Fedora: + yum list installed | grep -i --color compiler + +`GCC (GNU Compiler Collection) `_, is a widely used +compiler, which supports C and a number of other languages. 
If GCC is listed +as an installed compiler nothing more is required. + +If no C compiler is installed, or you wish to upgrade, or you're using a different +Linux distribution, consult your favorite search engine for compiler installation/update +instructions. + +Let us know if you have any difficulties by opening an issue or reaching out on our contributor +community :ref:`Slack `. + +Step 2: create an isolated environment +---------------------------------------- + +Before we begin, please: + +* Make sure that you have :any:`cloned the repository ` +* ``cd`` to the pandas source directory you just created with the clone command + +.. _contributing.mamba: + +Option 1: using mamba (recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Install `mamba `_ +* Make sure your mamba is up to date (``mamba update mamba``) + +.. code-block:: none + + # Create and activate the build environment + mamba env create --file environment.yml + mamba activate pandas-dev + +.. _contributing.pip: + +Option 2: using pip +~~~~~~~~~~~~~~~~~~~ + +You'll need to have at least the :ref:`minimum Python version ` that pandas supports. +You also need to have ``setuptools`` 51.0.0 or later to build pandas. + +**Unix**/**macOS with virtualenv** + +.. code-block:: bash + + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev + # Any parent directories should already exist + python3 -m venv ~/virtualenvs/pandas-dev + + # Activate the virtualenv + . ~/virtualenvs/pandas-dev/bin/activate + + # Install the build dependencies + python -m pip install -r requirements-dev.txt + +**Unix**/**macOS with pyenv** + +Consult the docs for setting up pyenv `here `__. + +.. code-block:: bash + + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/Users//.pyenv/versions/pandas-dev + pyenv virtualenv + + # For instance: + pyenv virtualenv 3.9.10 pandas-dev + + # Activate the virtualenv + pyenv activate pandas-dev + + # Now install the build dependencies in the cloned pandas repo + python -m pip install -r requirements-dev.txt + +**Windows** + +Below is a brief overview on how to set-up a virtual environment with Powershell +under Windows. For details please refer to the +`official virtualenv user guide `__. + +Use an ENV_DIR of your choice. We'll use ``~\\virtualenvs\\pandas-dev`` where +``~`` is the folder pointed to by either ``$env:USERPROFILE`` (Powershell) or +``%USERPROFILE%`` (cmd.exe) environment variable. Any parent directories +should already exist. + +.. code-block:: powershell + + # Create a virtual environment + python -m venv $env:USERPROFILE\virtualenvs\pandas-dev + + # Activate the virtualenv. Use activate.bat for cmd.exe + ~\virtualenvs\pandas-dev\Scripts\Activate.ps1 + + # Install the build dependencies + python -m pip install -r requirements-dev.txt + +Option 3: using Docker +~~~~~~~~~~~~~~~~~~~~~~ + +pandas provides a ``DockerFile`` in the root directory to build a Docker image +with a full pandas development environment. + +**Docker Commands** + +Build the Docker image:: + + # Build the image + docker build -t pandas-dev . 
+ +Run Container:: + + # Run a container and bind your local repo to the container + # This command assumes you are running from your local repo + # but if not alter ${PWD} to match your local repo path + docker run -it --rm -v ${PWD}:/home/pandas pandas-dev + +*Even easier, you can integrate Docker with the following IDEs:* + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the ``.devcontainer.json`` file. +See https://code.visualstudio.com/docs/remote/containers for details. + +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Option 4: using Gitpod +~~~~~~~~~~~~~~~~~~~~~~ + +Gitpod is an open-source platform that automatically creates the correct development +environment right in your browser, reducing the need to install local development +environments and deal with incompatible dependencies. + +If you are a Windows user, unfamiliar with using the command line or building pandas +for the first time, it is often faster to build with Gitpod. Here are the in-depth instructions +for :ref:`building pandas with GitPod `. + +Step 3: build and install pandas +-------------------------------- + +You can now run:: + + # Build and install pandas + python setup.py build_ext -j 4 + python -m pip install -e . --no-build-isolation --no-use-pep517 + +At this point you should be able to import pandas from your locally built version:: + + $ python + >>> import pandas + >>> print(pandas.__version__) # note: the exact output may differ + 2.0.0.dev0+880.g2b9e661fbb.dirty + +This will create the new environment, and not touch any of your existing environments, +nor any existing Python installation. + +.. note:: + You will need to repeat this step each time the C extensions change, for example + if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``. diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst new file mode 100644 index 0000000000000..c591be5425db9 --- /dev/null +++ b/doc/source/development/contributing_gitpod.rst @@ -0,0 +1,273 @@ +.. _contributing-gitpod: + +Using Gitpod for pandas development +=================================== + +This section of the documentation will guide you through: + +* using Gitpod for your pandas development environment +* creating a personal fork of the pandas repository on GitHub +* a quick tour of pandas and VSCode +* working on the pandas documentation in Gitpod + +Gitpod +------ + +`Gitpod`_ is an open-source platform for automated and ready-to-code +development environments. It enables developers to describe their dev +environment as code and start instant and fresh development environments for +each new task directly from your browser. This reduces the need to install local +development environments and deal with incompatible dependencies. + + +Gitpod GitHub integration +------------------------- + +To be able to use Gitpod, you will need to have the Gitpod app installed on your +GitHub account, so if +you do not have an account yet, you will need to create one first. + +To get started just login at `Gitpod`_, and grant the appropriate permissions to GitHub. + +We have built a python 3.8 environment and all development dependencies will +install when the environment starts. 
+ + +Forking the pandas repository +----------------------------- + +The best way to work on pandas as a contributor is by making a fork of the +repository first. + +#. Browse to the `pandas repository on GitHub`_ and `create your own fork`_. + +#. Browse to your fork. Your fork will have a URL like + https://github.com/noatamir/pandas-dev, except with your GitHub username in place of + ``noatamir``. + +Starting Gitpod +--------------- +Once you have authenticated to Gitpod through GitHub, you can install the +`Gitpod Chromium or Firefox browser extension `_ +which will add a **Gitpod** button next to the **Code** button in the +repository: + +.. image:: ./gitpod-imgs/pandas-github.png + :alt: pandas repository with Gitpod button screenshot + +#. If you install the extension - you can click the **Gitpod** button to start + a new workspace. + +#. Alternatively, if you do not want to install the browser extension, you can + visit https://gitpod.io/#https://github.com/USERNAME/pandas replacing + ``USERNAME`` with your GitHub username. + +#. In both cases, this will open a new tab on your web browser and start + building your development environment. Please note this can take a few + minutes. + +#. Once the build is complete, you will be directed to your workspace, + including the VSCode editor and all the dependencies you need to work on + pandas. The first time you start your workspace, you will notice that there + might be some actions running. This will ensure that you have a development + version of pandas installed. + +#. When your workspace is ready, you can :ref:`test the build` by + entering:: + + $ python -m pytest pandas + + +Quick workspace tour +-------------------- +Gitpod uses VSCode as the editor. If you have not used this editor before, you +can check the Getting started `VSCode docs`_ to familiarize yourself with it. + +Your workspace will look similar to the image below: + +.. image:: ./gitpod-imgs/gitpod-workspace.png + :alt: Gitpod workspace screenshot + +We have marked some important sections in the editor: + +#. Your current Python interpreter - by default, this is ``pandas-dev`` and + should be displayed in the status bar and on your terminal. You do not need + to activate the conda environment as this will always be activated for you. +#. Your current branch is always displayed in the status bar. You can also use + this button to change or create branches. +#. GitHub Pull Requests extension - you can use this to work with Pull Requests + from your workspace. +#. Marketplace extensions - we have added some essential extensions to the pandas + Gitpod. Still, you can also install other extensions or syntax highlighting + themes for your user, and these will be preserved for you. +#. Your workspace directory - by default, it is ``/workspace/pandas-dev``. **Do not + change this** as this is the only directory preserved in Gitpod. + +We have also pre-installed a few tools and VSCode extensions to help with the +development experience: + +* `VSCode rst extension `_ +* `Markdown All in One `_ +* `VSCode Gitlens extension `_ +* `VSCode Git Graph extension `_ + +Development workflow with Gitpod +-------------------------------- +The :ref:`contributing` section of this documentation contains +information regarding the pandas development workflow. Make sure to check this +before working on your contributions. + +When using Gitpod, git is pre configured for you: + +#. 
You do not need to configure your git username, and email as this should be + done for you as you authenticated through GitHub. Unless you are using GitHub + feature to keep email address private. You can check the git + configuration with the command ``git config --list`` in your terminal. Use + ``git config --global user.email “your-secret-email@users.noreply.github.com”`` + to set your email address to the one you use to make commits with your github + profile. +#. As you started your workspace from your own pandas fork, you will by default + have both ``upstream`` and ``origin`` added as remotes. You can verify this by + typing ``git remote`` on your terminal or by clicking on the **branch name** + on the status bar (see image below). + + .. image:: ./gitpod-imgs/pandas-gitpod-branches.png + :alt: Gitpod workspace branches plugin screenshot + +Rendering the pandas documentation +---------------------------------- +You can find the detailed documentation on how rendering the documentation with +Sphinx works in the :ref:`contributing.howto-build-docs` section. To build the full +docs you need to run the following command in the docs directory:: + + $ cd docs + $ python make.py html + +Alternatively you can build a single page with:: + + python make.py --single development/contributing_gitpod.rst + +You have two main options to render the documentation in Gitpod. + +Option 1: using Liveserve +~~~~~~~~~~~~~~~~~~~~~~~~~ + +#. View the documentation in ``pandas/doc/build/html``. +#. To see the rendered version of a page, you can right-click on the ``.html`` + file and click on **Open with Live Serve**. Alternatively, you can open the + file in the editor and click on the **Go live** button on the status bar. + + .. image:: ./gitpod-imgs/vscode-statusbar.png + :alt: Gitpod workspace VSCode start live serve screenshot + +#. A simple browser will open to the right-hand side of the editor. We recommend + closing it and click on the **Open in browser** button in the pop-up. +#. To stop the server click on the **Port: 5500** button on the status bar. + +Option 2: using the rst extension +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A quick and easy way to see live changes in a ``.rst`` file as you work on it +uses the rst extension with docutils. + +.. note:: This will generate a simple live preview of the document without the + ``html`` theme, and some backlinks might not be added correctly. But it is an + easy and lightweight way to get instant feedback on your work, without + building the html files. + +#. Open any of the source documentation files located in ``doc/source`` in the + editor. +#. Open VSCode Command Palette with :kbd:`Cmd-Shift-P` in Mac or + :kbd:`Ctrl-Shift-P` in Linux and Windows. Start typing "restructured" + and choose either "Open preview" or "Open preview to the Side". + + .. image:: ./gitpod-imgs/vscode-rst.png + :alt: Gitpod workspace VSCode open rst screenshot + +#. As you work on the document, you will see a live rendering of it on the editor. + + .. image:: ./gitpod-imgs/rst-rendering.png + :alt: Gitpod workspace VSCode rst rendering screenshot + +If you want to see the final output with the ``html`` theme you will need to +rebuild the docs with ``make html`` and use Live Serve as described in option 1. + +FAQ's and troubleshooting +------------------------- + +How long is my Gitpod workspace kept for? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Your stopped workspace will be kept for 14 days and deleted afterwards if you do +not use them. 
+ +Can I come back to a previous workspace? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Yes, let's say you stepped away for a while and you want to carry on working on +your pandas contributions. You need to visit https://gitpod.io/workspaces and +click on the workspace you want to spin up again. All your changes will be there +as you last left them. + +Can I install additional VSCode extensions? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Absolutely! Any extensions you installed will be installed in your own workspace +and preserved. + +I registered on Gitpod but I still cannot see a ``Gitpod`` button in my repositories. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Head to https://gitpod.io/integrations and make sure you are logged in. +Hover over GitHub and click on the three buttons that appear on the right. +Click on edit permissions and make sure you have ``user:email``, +``read:user``, and ``public_repo`` checked. Click on **Update Permissions** +and confirm the changes in the GitHub application page. + +.. image:: ./gitpod-imgs/gitpod-edit-permissions-gh.png + :alt: Gitpod integrations - edit GH permissions screenshot + +How long does my workspace stay active if I'm not using it? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you keep your workspace open in a browser tab but don't interact with it, +it will shut down after 30 minutes. If you close the browser tab, it will +shut down after 3 minutes. + +My terminal is blank - there is no cursor and it's completely unresponsive +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unfortunately this is a known-issue on Gitpod's side. You can sort this +issue in two ways: + +#. Create a new Gitpod workspace altogether. +#. Head to your `Gitpod dashboard `_ and locate + the running workspace. Hover on it and click on the **three dots menu** + and then click on **Stop**. When the workspace is completely stopped you + can click on its name to restart it again. + +.. image:: ./gitpod-imgs/gitpod-dashboard-stop.png + :alt: Gitpod dashboard and workspace menu screenshot + +I authenticated through GitHub but I still cannot commit to the repository through Gitpod. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Head to https://gitpod.io/integrations and make sure you are logged in. +Hover over GitHub and click on the three buttons that appear on the right. +Click on edit permissions and make sure you have ``public_repo`` checked. +Click on **Update Permissions** and confirm the changes in the +GitHub application page. + +.. image:: ./gitpod-imgs/gitpod-edit-permissions-repo.png + :alt: Gitpod integrations - edit GH repository permissions screenshot + +Acknowledgments +--------------- + +This page is lightly adapted from the `NumPy`_ project . + +.. _Gitpod: https://www.gitpod.io/ +.. _pandas repository on GitHub: https://github.com/pandas-dev/pandas +.. _create your own fork: https://help.github.com/en/articles/fork-a-repo +.. _VSCode docs: https://code.visualstudio.com/docs/getstarted/tips-and-tricks +.. _NumPy: https://www.numpy.org/ diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst new file mode 100644 index 0000000000000..9a2309b8a77a3 --- /dev/null +++ b/doc/source/development/copy_on_write.rst @@ -0,0 +1,42 @@ +.. 
_copy_on_write_dev: + +{{ header }} + +************* +Copy on write +************* + +Copy on Write is a mechanism to simplify the indexing API and improve +performance through avoiding copies if possible. +CoW means that any DataFrame or Series derived from another in any way always +behaves as a copy. An explanation on how to use Copy on Write efficiently can be +found :ref:`here `. + +Reference tracking +------------------ + +To be able to determine if we have to make a copy when writing into a DataFrame, +we have to be aware if the values are shared with another DataFrame. pandas +keeps track of all ``Blocks`` that share values with another block internally to +be able to tell when a copy needs to be triggered. The reference tracking +mechanism is implemented on the Block level. + +We use a custom reference tracker object, ``BlockValuesRefs``, that keeps +track of every block, whose values share memory with each other. The reference +is held through a weak-reference. Every pair of blocks that share some memory should +point to the same ``BlockValuesRefs`` object. If one block goes out of +scope, the reference to this block dies. As a consequence, the reference tracker +object always knows how many blocks are alive and share memory. + +Whenever a :class:`DataFrame` or :class:`Series` object is sharing data with another +object, it is required that each of those objects have its own BlockManager and Block +objects. Thus, in other words, one Block instance (that is held by a DataFrame, not +necessarily for intermediate objects) should always be uniquely used for only +a single DataFrame/Series object. For example, when you want to use the same +Block for another object, you can create a shallow copy of the Block instance +with ``block.copy(deep=False)`` (which will create a new Block instance with +the same underlying values and which will correctly set up the references). + +We can ask the reference tracking object if there is another block alive that shares +data with us before writing into the values. We can trigger a copy before +writing if there is in fact another block alive. diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst new file mode 100644 index 0000000000000..32cb8f4c4d8cd --- /dev/null +++ b/doc/source/development/debugging_extensions.rst @@ -0,0 +1,131 @@ +.. _debugging_c_extensions: + +{{ header }} + +====================== +Debugging C extensions +====================== + +Pandas uses select C extensions for high performance IO operations. In case you need to debug segfaults or general issues with those extensions, the following steps may be helpful. + +First, be sure to compile the extensions with the appropriate flags to generate debug symbols and remove optimizations. This can be achieved as follows: + +.. code-block:: sh + + python setup.py build_ext --inplace -j4 --with-debugging-symbols + +Using a debugger +================ + +Assuming you are on a Unix-like operating system, you can use either lldb or gdb to debug. The choice between either is largely dependent on your compilation toolchain - typically you would use lldb if using clang and gdb if using gcc. For macOS users, please note that ``gcc`` is on modern systems an alias for ``clang``, so if using Xcode you usually opt for lldb. Regardless of which debugger you choose, please refer to your operating systems instructions on how to install. + +After installing a debugger you can create a script that hits the extension module you are looking to debug. 
For demonstration purposes, let's assume you have a script called ``debug_testing.py`` with the following contents: + +.. code-block:: python + + import pandas as pd + + pd.DataFrame([[1, 2]]).to_json() + +Place the ``debug_testing.py`` script in the project root and launch a Python process under your debugger. If using lldb: + +.. code-block:: sh + + lldb python + +If using gdb: + +.. code-block:: sh + + gdb python + +Before executing our script, let's set a breakpoint in our JSON serializer in its entry function called ``objToJSON``. The lldb syntax would look as follows: + +.. code-block:: sh + + breakpoint set --name objToJSON + +Similarly for gdb: + +.. code-block:: sh + + break objToJSON + +.. note:: + + You may get a warning that this breakpoint cannot be resolved in lldb. gdb may give a similar warning and prompt you to make the breakpoint on a future library load, which you should say yes to. This should only happen on the very first invocation as the module you wish to debug has not yet been loaded into memory. + +Now go ahead and execute your script: + +.. code-block:: sh + + run .py + +Code execution will halt at the breakpoint defined or at the occurrence of any segfault. LLDB's `GDB to LLDB command map `_ provides a listing of debugger command that you can execute using either debugger. + +Another option to execute the entire test suite under lldb would be to run the following: + +.. code-block:: sh + + lldb -- python -m pytest + +Or for gdb + +.. code-block:: sh + + gdb --args python -m pytest + +Once the process launches, simply type ``run`` and the test suite will begin, stopping at any segmentation fault that may occur. + +Improve debugger printing +========================= + +By default your debug will simply print the type and memory address of a PyObject. Assuming we passed a list containing ``["a", "b"]`` as an argument to a Cython-generated function with parameter ``obj``, debugging that object would look as follows: + +.. code-block:: sh + + (gdb) p __pyx_v_obj + $1 = (PyObject *) 0x5555558b91e0 + +Dereferencing this will yield the standard PyObject struct members of the object, which provides some more visibility + +.. code-block:: sh + + (gdb) p *__pyx_v_obj + $2 = {ob_refcnt = 1, ob_type = 0x5555558b91e0 } + +If you are using gdb, CPython provides an extension that prints out more useful information about the object you are inspecting. The extension can be found in `cpython/Tools/gdb/libpython.py `_; for best results be sure to use the gdb extension from the CPython branch that matches the version of your interpreter. + +To activate the extension you will need to execute ``source /Tools/gdb/libpython.py`` from an actively-running gdb session. After loading you will get more detailed information about the Python object you are inspecting. + +.. code-block:: sh + + (gdb) p __pyx_v_obj + $3 = ['a', 'b'] + +If you do not wish to explicitly source this file on every gdb run, you can alternately add it as a start up command to your `gdbinit `_ file. + +Checking memory leaks with valgrind +=================================== + +You can use `Valgrind `_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run: + +.. code-block:: sh + + PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest + +Note that code execution under valgrind will take much longer than usual. 
While you can run valgrind against extensions compiled with any optimization level, it is suggested to have optimizations turned off from compiled extensions to reduce the amount of false positives. The ``--with-debugging-symbols`` flag passed during package setup will do this for you automatically. + +.. note:: + + For best results, you should run use a Python installation configured with Valgrind support (--with-valgrind) + + +Easier code navigation +====================== + +Generating a ``compile_commands.json`` file may make it easier to navigate the C extensions, as this allows your code editor to list references, jump to definitions, etc... To make this work with setuptools you can use `Bear `_. + +.. code-block:: sh + + bear -- python setup.py build_ext --inplace -j4 --with-debugging-symbols diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index fbd83af3de82e..6de237b70f08d 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -71,11 +71,13 @@ descriptor format for these as is follows: .. code-block:: python index = pd.RangeIndex(0, 10, 2) - {'kind': 'range', - 'name': index.name, - 'start': index.start, - 'stop': index.stop, - 'step': index.step} + { + "kind": "range", + "name": index.name, + "start": index.start, + "stop": index.stop, + "step": index.step, + } Other index types must be serialized as data columns along with the other DataFrame columns. The metadata for these is a string indicating the name of @@ -178,8 +180,8 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0', + 'pandas_version': '1.4.0', 'creator': { 'library': 'pyarrow', 'version': '0.13.0' - }} \ No newline at end of file + }} diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 1e6b2c646fdfd..c7286616672b9 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. The class's @staticmethod def _validate(obj): # verify there is a column latitude and a column longitude - if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + if "latitude" not in obj.columns or "longitude" not in obj.columns: raise AttributeError("Must have 'latitude' and 'longitude'.") @property @@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's Now users can access your methods using the ``geo`` namespace: - >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) + >>> ds = pd.DataFrame( + ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} + ... ) >>> ds.geo.center (5.0, 10.0) >>> ds.geo.plot() @@ -61,7 +62,7 @@ This can be a convenient way to extend pandas objects without subclassing them. If you write a custom accessor, make a pull request adding it to our :ref:`ecosystem` page. -We highly recommend validating the data in your accessor's `__init__`. +We highly recommend validating the data in your accessor's ``__init__``. In our ``GeoAccessor``, we validate that the data contains the expected columns, raising an ``AttributeError`` when the validation fails. For a ``Series`` accessor, you should validate the ``dtype`` if the accessor @@ -73,12 +74,11 @@ applies only to certain dtypes. Extension types --------------- -.. versionadded:: 0.23.0 - -.. warning:: +.. 
note:: - The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and - experimental. They may change between versions without warning. + The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs were + experimental prior to pandas 1.5. Starting with version 1.5, future changes will follow + the :ref:`pandas deprecation policy `. pandas defines an interface for implementing data types and arrays that *extend* NumPy's type system. pandas itself uses the extension system for some types @@ -107,9 +107,7 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``. See the `extension dtype source`_ for interface definition. -.. versionadded:: 0.24.0 - -:class:`pandas.api.extension.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. +:class:`pandas.api.extensions.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate ``Series`` and ``.astype()`` with a registered string name, for example ``'category'`` is a registered string accessor for the ``CategoricalDtype``. @@ -128,7 +126,7 @@ data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for ``Categorical``). They may be backed by none, one, or many NumPy arrays. For example, -``pandas.Categorical`` is an extension array backed by two arrays, +:class:`pandas.Categorical` is an extension array backed by two arrays, one for codes and one for categories. An array of IPv6 addresses may be backed by a NumPy structured array with two fields, one for the lower 64 bits and one for the upper 64 bits. Or they may be backed @@ -142,8 +140,6 @@ and comments contain guidance for properly implementing the interface. :class:`~pandas.api.extensions.ExtensionArray` operator support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.24.0 - By default, there are no operators defined for the class :class:`~pandas.api.extensions.ExtensionArray`. There are two approaches for providing operator support for your ExtensionArray: @@ -178,6 +174,7 @@ your ``MyExtensionArray`` class, as follows: from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin + class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): pass @@ -219,7 +216,7 @@ and re-boxes it if necessary. If applicable, we highly recommend that you implement ``__array_ufunc__`` in your extension array to avoid coercion to an ndarray. See -`the numpy documentation `__ +`the NumPy documentation `__ for an example. As part of your implementation, we require that you defer to pandas when a pandas @@ -235,7 +232,7 @@ Testing extension arrays We provide a test suite for ensuring that your extension arrays satisfy the expected behavior. To use the test suite, you must provide several pytest fixtures and inherit from the base test class. The required fixtures are found in -https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/conftest.py. +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/conftest.py. To use a test, subclass it: @@ -248,7 +245,7 @@ To use a test, subclass it: pass -See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py +See https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/base/__init__.py for a list of all the tests available. .. 
_extending.extension.arrow: @@ -273,6 +270,7 @@ included as a column in a pandas DataFrame): def __arrow_array__(self, type=None): # convert the underlying array values to a pyarrow Array import pyarrow + return pyarrow.array(..., type=type) The ``ExtensionDtype.__from_arrow__`` method then controls the conversion @@ -293,9 +291,9 @@ See more in the `Arrow documentation >> type(to_framed) - >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -389,7 +375,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(df) - >>> sliced1 = df[['A', 'B']] + >>> sliced1 = df[["A", "B"]] >>> sliced1 A B 0 1 4 @@ -399,7 +385,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df["A"] >>> sliced2 0 1 1 2 @@ -424,11 +410,11 @@ Below is an example to define two original properties, "internal_cache" as a tem class SubclassedDataFrame2(pd.DataFrame): # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] _internal_names_set = set(_internal_names) # normal properties - _metadata = ['added_property'] + _metadata = ["added_property"] @property def _constructor(self): @@ -436,15 +422,15 @@ Below is an example to define two original properties, "internal_cache" as a tem .. code-block:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + >>> df.internal_cache = "cached" + >>> df.added_property = "property" >>> df.internal_cache cached @@ -452,11 +438,11 @@ Below is an example to define two original properties, "internal_cache" as a tem property # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache + >>> df[["A", "B"]].internal_cache AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + >>> df[["A", "B"]].added_property property .. _extending.plotting-backends: @@ -470,7 +456,7 @@ one based on Matplotlib. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: @@ -483,7 +469,7 @@ This would be more or less equivalent to: The backend module can then use other visualization tools (Bokeh, Altair,...) to generate the plots. -Libraries implementing the plotting backend should use `entry points `__ +Libraries implementing the plotting backend should use `entry points `__ to make their backend discoverable to pandas. The key is ``"pandas_plotting_backends"``. For example, pandas registers the default "matplotlib" backend as follows. @@ -501,4 +487,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. \ No newline at end of file +https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1. 
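As a rough sketch of the registration side, a third-party backend could advertise itself through the same ``pandas_plotting_backends`` entry point group in its own packaging metadata. The distribution, package, and backend names below are hypothetical, and the backend module is assumed to expose the top-level ``plot`` function described in ``pandas/plotting/__init__.py``:

.. code-block:: python

    # setup.py of a hypothetical third-party plotting backend
    from setuptools import setup

    setup(
        name="my-pandas-backend",           # hypothetical distribution name
        version="0.1.0",
        packages=["my_pandas_backend"],     # module implementing plot(data, kind, **kwargs)
        entry_points={
            "pandas_plotting_backends": [
                # users would then activate it with
                # pd.set_option("plotting.backend", "my_backend")
                "my_backend = my_pandas_backend",
            ],
        },
    )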
diff --git a/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png b/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png new file mode 100644 index 0000000000000..b64790a986646 Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png differ diff --git a/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png new file mode 100644 index 0000000000000..ec21a9064c83d Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png differ diff --git a/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png new file mode 100644 index 0000000000000..8bfaff81cfb69 Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png differ diff --git a/doc/source/development/gitpod-imgs/gitpod-workspace.png b/doc/source/development/gitpod-imgs/gitpod-workspace.png new file mode 100644 index 0000000000000..daf763e9adb05 Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-workspace.png differ diff --git a/doc/source/development/gitpod-imgs/pandas-github.png b/doc/source/development/gitpod-imgs/pandas-github.png new file mode 100644 index 0000000000000..010b0fc5ea33d Binary files /dev/null and b/doc/source/development/gitpod-imgs/pandas-github.png differ diff --git a/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png b/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png new file mode 100644 index 0000000000000..f95c66056ca37 Binary files /dev/null and b/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png differ diff --git a/doc/source/development/gitpod-imgs/rst-rendering.png b/doc/source/development/gitpod-imgs/rst-rendering.png new file mode 100644 index 0000000000000..b613c621c398b Binary files /dev/null and b/doc/source/development/gitpod-imgs/rst-rendering.png differ diff --git a/doc/source/development/gitpod-imgs/vscode-rst.png b/doc/source/development/gitpod-imgs/vscode-rst.png new file mode 100644 index 0000000000000..5b574c115a2b7 Binary files /dev/null and b/doc/source/development/gitpod-imgs/vscode-rst.png differ diff --git a/doc/source/development/gitpod-imgs/vscode-statusbar.png b/doc/source/development/gitpod-imgs/vscode-statusbar.png new file mode 100644 index 0000000000000..dad25369fedfd Binary files /dev/null and b/doc/source/development/gitpod-imgs/vscode-statusbar.png differ diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index f8a6bb6deb52d..69f04494a271c 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -13,11 +13,15 @@ Development :maxdepth: 2 contributing - code_style + contributing_environment + contributing_documentation + contributing_codebase maintaining internals + copy_on_write + debugging_extensions extending developer policies roadmap - meeting + community diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 8f1c3d5d818c2..3dd687ef2087d 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -15,24 +15,21 @@ Indexing In pandas there are a few objects implemented which can serve as valid containers for the axis labels: -* ``Index``: the generic "ordered set" object, an ndarray of object dtype +* :class:`Index`: the generic "ordered set" object, an ndarray of object dtype assuming nothing about its contents. 
The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do ``O(1)`` lookups. -* ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer - data, such as time stamps -* ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data -* ``MultiIndex``: the standard hierarchical index object -* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values) -* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values) -* ``PeriodIndex``: An Index object with Period elements +* :class:`MultiIndex`: the standard hierarchical index object +* :class:`DatetimeIndex`: An Index object with :class:`Timestamp` boxed elements (impl are the int64 values) +* :class:`TimedeltaIndex`: An Index object with :class:`Timedelta` boxed elements (impl are the in64 values) +* :class:`PeriodIndex`: An Index object with Period elements There are functions that make the creation of a regular index easy: -* ``date_range``: fixed frequency date range generated from a time rule or +* :func:`date_range`: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects -* ``period_range``: fixed frequency date range generated from a time rule or - DateOffset. An ndarray of ``Period`` objects, representing timespans +* :func:`period_range`: fixed frequency date range generated from a time rule or + DateOffset. An ndarray of :class:`Period` objects, representing timespans The motivation for having an ``Index`` class in the first place was to enable different implementations of indexing. This means that it's possible for you, @@ -43,33 +40,34 @@ From an internal implementation point of view, the relevant methods that an ``Index`` must define are one or more of the following (depending on how incompatible the new object internals are with the ``Index`` functions): -* ``get_loc``: returns an "indexer" (an integer, or in some cases a +* :meth:`~Index.get_loc`: returns an "indexer" (an integer, or in some cases a slice object) for a label -* ``slice_locs``: returns the "range" to slice between two labels -* ``get_indexer``: Computes the indexing vector for reindexing / data +* :meth:`~Index.slice_locs`: returns the "range" to slice between two labels +* :meth:`~Index.get_indexer`: Computes the indexing vector for reindexing / data alignment purposes. See the source / docstrings for more on this -* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data +* :meth:`~Index.get_indexer_non_unique`: Computes the indexing vector for reindexing / data alignment purposes when the index is non-unique. 
See the source / docstrings for more on this -* ``reindex``: Does any pre-conversion of the input index then calls +* :meth:`~Index.reindex`: Does any pre-conversion of the input index then calls ``get_indexer`` -* ``union``, ``intersection``: computes the union or intersection of two +* :meth:`~Index.union`, :meth:`~Index.intersection`: computes the union or intersection of two Index objects -* ``insert``: Inserts a new label into an Index, yielding a new object -* ``delete``: Delete a label, yielding a new object -* ``drop``: Deletes a set of labels -* ``take``: Analogous to ndarray.take +* :meth:`~Index.insert`: Inserts a new label into an Index, yielding a new object +* :meth:`~Index.delete`: Delete a label, yielding a new object +* :meth:`~Index.drop`: Deletes a set of labels +* :meth:`~Index.take`: Analogous to ndarray.take MultiIndex ~~~~~~~~~~ -Internally, the ``MultiIndex`` consists of a few things: the **levels**, the -integer **codes** (until version 0.24 named *labels*), and the level **names**: +Internally, the :class:`MultiIndex` consists of a few things: the **levels**, the +integer **codes**, and the level **names**: .. ipython:: python - index = pd.MultiIndex.from_product([range(3), ['one', 'two']], - names=['first', 'second']) + index = pd.MultiIndex.from_product( + [range(3), ["one", "two"]], names=["first", "second"] + ) index index.levels index.codes @@ -79,13 +77,13 @@ You can probably guess that the codes determine which unique element is identified with that location at each layer of the index. It's important to note that sortedness is determined **solely** from the integer codes and does not check (or care) whether the levels themselves are sorted. Fortunately, the -constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but -if you compute the levels and codes yourself, please be careful. +constructors :meth:`~MultiIndex.from_tuples` and :meth:`~MultiIndex.from_arrays` ensure +that this is true, but if you compute the levels and codes yourself, please be careful. Values ~~~~~~ -pandas extends NumPy's type system with custom types, like ``Categorical`` or +pandas extends NumPy's type system with custom types, like :class:`Categorical` or datetimes with a timezone, so we have multiple notions of "values". For 1-D containers (``Index`` classes and ``Series``) we have the following convention: diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 9f9e9dc2631f3..994dfde0894f3 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -121,6 +121,49 @@ Here's a typical workflow for triaging a newly opened issue. unless it's know that this issue should be addressed in a specific release (say because it's a large regression). +.. _maintaining.regressions: + +Investigating regressions +------------------------- + +Regressions are bugs that unintentionally break previously working code. The common way +to investigate regressions is by using +`git bisect `_, +which finds the first commit that introduced the bug. + +For example: a user reports that ``pd.Series([1, 1]).sum()`` returns ``3`` +in pandas version ``1.5.0`` while in version ``1.4.0`` it returned ``2``. To begin, +create a file ``t.py`` in your pandas directory, which contains + +.. 
code-block:: python + + import pandas as pd + assert pd.Series([1, 1]).sum() == 2 + +and then run:: + + git bisect start + git bisect good v1.4.0 + git bisect bad v1.5.0 + git bisect run bash -c "python setup.py build_ext -j 4; python t.py" + +This finds the first commit that changed the behavior. The C extensions have to be +rebuilt at every step, so the search can take a while. + +Exit bisect and rebuild the current version:: + + git bisect reset + python setup.py build_ext -j 4 + +Report your findings under the corresponding issue and ping the commit author to get +their input. + +.. note:: + In the ``bisect run`` command above, commits are considered good if ``t.py`` exits + with ``0`` and bad otherwise. When raising an exception is the desired behavior, + wrap the code in an appropriate ``try/except`` statement. See :issue:`35685` for + more examples. + .. _maintaining.closing: Closing issues @@ -132,17 +175,56 @@ respond or self-close their issue if it's determined that the behavior is not a or the feature is out of scope. Sometimes reporters just go away though, and we'll close the issue after the conversation has died. +.. _maintaining.reviewing: + Reviewing pull requests ----------------------- Anybody can review a pull request: regular contributors, triagers, or core-team -members. Here are some guidelines to check. +members. But only core-team members can merge pull requests when they're ready. + +Here are some things to check when reviewing a pull request. -* Tests should be in a sensible location. +* Tests should be in a sensible location: in the same file as closely related tests. * New public APIs should be included somewhere in ``doc/source/reference/``. * New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring. * User-facing changes should have a whatsnew in the appropriate file. * Regression tests should reference the original GitHub issue number like ``# GH-1234``. +* The pull request should be labeled and assigned the appropriate milestone (the next patch release + for regression fixes and small bug fixes, the next minor milestone otherwise) +* Changes should comply with our :ref:`policies.version`. + + +.. _maintaining.backporting: + +Backporting +----------- + +pandas supports point releases (e.g. ``1.4.3``) that aim to: + +1. Fix bugs in new features introduced in the first minor version release. + + * e.g. If a new feature was added in ``1.4`` and contains a bug, a fix can be applied in ``1.4.3`` + +2. Fix bugs that used to work in a few minor releases prior. There should be agreement between core team members that a backport is appropriate. + + * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``. + +Since pandas minor releases are based on GitHub branches (e.g. point release of ``1.4`` are based off the ``1.4.x`` branch), +"backporting" means merging a pull request fix to the ``main`` branch and correct minor branch associated with the next point release. + +By default, if a pull request is assigned to the next point release milestone within the GitHub interface, +the backporting process should happen automatically by the ``@meeseeksdev`` bot once the pull request is merged. +A new pull request will be made backporting the pull request to the correct version branch. +Sometimes due to merge conflicts, a manual pull request will need to be made addressing the code conflict. 
+ +If the bot does not automatically start the backporting process, you can also write a GitHub comment in the merged pull request +to trigger the backport:: + + @meeseeksdev backport version-branch + +This will trigger a workflow which will backport a given change to a branch +(e.g. @meeseeksdev backport 1.4.x) Cleaning up old issues ---------------------- @@ -186,8 +268,233 @@ The full process is outlined in our `governance documents`_. In summary, we're happy to give triage permissions to anyone who shows interest by being helpful on the issue tracker. +The required steps for adding a maintainer are: + +1. Contact the contributor and ask their interest to join. +2. Add the contributor to the appropriate `GitHub Team `_ if accepted the invitation. + + * ``pandas-core`` is for core team members + * ``pandas-triage`` is for pandas triage members + +3. Add the contributor to the pandas Google group. +4. Create a pull request to add the contributor's GitHub handle to ``pandas-dev/pandas/web/pandas/config.yml``. +5. Create a pull request to add the contributor's name/GitHub handle to the `governance document `_. + The current list of core-team members is at https://github.com/pandas-dev/pandas-governance/blob/master/people.md + +.. _maintaining.merging: + +Merging pull requests +--------------------- + +Only core team members can merge pull requests. We have a few guidelines. + +1. You should typically not self-merge your own pull requests. Exceptions include + things like small changes to fix CI (e.g. pinning a package version). +2. You should not merge pull requests that have an active discussion, or pull + requests that has any ``-1`` votes from a core maintainer. pandas operates + by consensus. +3. For larger changes, it's good to have a +1 from at least two core team members. + +In addition to the items listed in :ref:`maintaining.closing`, you should verify +that the pull request is assigned the correct milestone. + +Pull requests merged with a patch-release milestone will typically be backported +by our bot. Verify that the bot noticed the merge (it will leave a comment within +a minute typically). If a manual backport is needed please do that, and remove +the "Needs backport" label once you've done it manually. If you forget to assign +a milestone before tagging, you can request the bot to backport it with: + +.. code-block:: console + + @Meeseeksdev backport + + +.. _maintaining.asv-machine: + +Benchmark machine +----------------- + +The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results +are published to https://asv-runner.github.io/asv-collection/pandas/ + +Configuration +````````````` + +The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner. + +Publishing +`````````` + +The results are published to another GitHub repository, https://github.com/tomaugspurger/asv-collection. +Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``. +Ask Tom or Joris for access to the webserver. + +Debugging +````````` + +The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them + + ssh -L 8080:localhost:8080 pandas@panda.likescandy.com + + +.. _maintaining.release: + +Release process +--------------- + +The release process makes a snapshot of pandas (a git commit) available to users with +a particular version number. 
After the release the new pandas version will be available +in the next places: + +- Git repo with a `new tag `_ +- Source distribution in a `GitHub release `_ +- Pip packages in the `PyPI `_ +- Conda/Mamba packages in `conda-forge `_ + +The process for releasing a new version of pandas is detailed next section. + +The instructions contain ```` which needs to be replaced with the version +to be released (e.g. ``1.5.2``). Also the branch to be released ````, which +depends on whether the version being released is the release candidate of a new version, +or any other version. Release candidates are released from ``main``, while other +versions are released from their branch (e.g. ``1.5.x``). + + +Prerequisites +````````````` + +In order to be able to release a new pandas version, the next permissions are needed: + +- Merge rights to the `pandas `_, + `pandas-wheels `_, and + `pandas-feedstock `_ repositories. +- Permissions to push to main in the pandas repository, to push the new tags. +- `Write permissions to PyPI `_ +- Access to the social media accounts, to publish the announcements. + +Pre-release +``````````` + +1. Agree with the core team on the next topics: + + - Release date (major/minor releases happen usually every 6 months, and patch releases + monthly until x.x.5, just before the next major/minor) + - Blockers (issues and PRs that must be part of the release) + - Next version after the one being released + +2. Update and clean release notes for the version to be released, including: + + - Set the final date of the release + - Remove any unused bullet point + - Make sure there are no formatting issues, typos, etc. + +3. Make sure the CI is green for the last commit of the branch being released. + +4. If not a release candidate, make sure all backporting pull requests to the branch + being released are merged. + +5. Create a new issue and milestone for the version after the one being released. + If the release was a release candidate, we would usually want to create issues and + milestones for both the next major/minor, and the next patch release. In the + milestone of a patch release, we add the description ``on-merge: backport to ``, + so tagged PRs are automatically backported to the release branch by our bot. + +6. Change the milestone of all issues and PRs in the milestone being released to the + next milestone. + +Release +``````` + +1. Create an empty commit and a tag in the last commit of the branch to be released:: + + git checkout + git pull --ff-only upstream + git clean -xdf + git commit --allow-empty --author="Pandas Development Team " -m "RLS: " + git tag -a v -m "Version " # NOTE that the tag is v1.5.2 with "v" not 1.5.2 + git push upstream --follow-tags + +The docs for the new version will be built and published automatically with the docs job in the CI, +which will be triggered when the tag is pushed. + +2. Only if the release is a release candidate, we want to create a new branch for it, immediately + after creating the tag. For example, if we are releasing pandas 1.4.0rc0, we would like to + create the branch 1.4.x to backport commits to the 1.4 versions. As well as create a tag to + mark the start of the development of 1.5.0 (assuming it is the next version):: + + git checkout -b 1.4.x + git push upstream 1.4.x + git checkout main + git commit --allow-empty -m "Start 1.5.0" + git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0" + git push upstream main --follow-tags + +3. 
Build the source distribution (git must be in the tag commit): + + ./setup.py sdist --formats=gztar --quiet + +4. Create a `new GitHub release `_: + + - Title: ``Pandas `` + - Tag: ```` + - Files: ``pandas-.tar.gz`` source distribution just generated + - Description: Copy the description of the last release of the same kind (release candidate, major/minor or patch release) + - Set as a pre-release: Only check for a release candidate + - Set as the latest release: Leave checked, unless releasing a patch release for an older version + (e.g. releasing 1.4.5 after 1.5 has been released) + +5. The GitHub release will after some hours trigger an + `automated conda-forge PR `_. + Merge it once the CI is green, and it will generate the conda-forge packages. + +6. Packages for supported versions in PyPI are built in the + `MacPython repo `_. + Open a PR updating the build commit to the released version, and merge it once the + CI is green. To do this type:: + + git checkout master + git pull --ff-only upstream master + git checkout -B RLS- + sed -i 's/BUILD_COMMIT: "v.*/BUILD_COMMIT: "'v'"/' azure/windows.yml azure/posix.yml + sed -i 's/BUILD_COMMIT="v.*/BUILD_COMMIT="'v'"/' .travis.yml + git commit -am "RLS " + git push -u origin RLS- + +7. Download all wheels from the Anaconda repository where MacPython uploads them: + https://anaconda.org/multibuild-wheels-staging/pandas/files?version= + to the ``dist/`` directory in the local pandas copy. You can use the script + ``scripts/download_wheels.sh`` to download all wheels at once. + +8. Upload wheels to PyPI: + + twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing + +Post-Release +```````````` + +1. Update symlink to stable documentation by logging in to our web server, and + editing ``/var/www/html/pandas-docs/stable`` to point to ``version/``. + +2. If releasing a major or minor release, open a PR in our source code to update + ``web/pandas/versions.json``, to have the desired versions in the documentation + dropdown menu. + +3. Close the milestone and the issue for the released version. + +4. Create a new issue for the next release, with the estimated date of release. + +5. Open a PR with the placeholder for the release notes of the next version. See + for example `the PR for 1.5.3 `_. + +6. Announce the new release in the official channels (use previous announcements + for reference): + + - The pandas-dev and pydata mailing lists + - Twitter, Mastodon and Telegram + + .. _governance documents: https://github.com/pandas-dev/pandas-governance -.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file +.. _list of permissions: https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-roles-for-an-organization diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst deleted file mode 100644 index 35826af5912c2..0000000000000 --- a/doc/source/development/meeting.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _meeting: - -================== -Developer meetings -================== - -We hold regular developer meetings on the second Wednesday -of each month at 18:00 UTC. These meetings and their minutes are open to -the public. All are welcome to join. - -Minutes -------- - -The minutes of past meetings are available in `this Google Document `__. - -Calendar --------- - -This calendar shows all the developer meetings. - -.. 
raw:: html - - - -You can subscribe to this calendar with the following links: - -* `iCal `__ -* `Google calendar `__ - -Additionally, we'll sometimes have one-off meetings on specific topics. -These will be published on the same calendar. diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 1031bbfc46457..d079cc59b0ca5 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -9,14 +9,12 @@ Policies Version policy ~~~~~~~~~~~~~~ -.. versionchanged:: 1.0.0 - pandas uses a loose variant of semantic versioning (`SemVer`_) to govern deprecations, API compatibility, and version numbering. A pandas release number is made up of ``MAJOR.MINOR.PATCH``. -API breaking changes should only occur in **major** releases. Theses changes +API breaking changes should only occur in **major** releases. These changes will be documented, with clear guidance on what is changing, why it's changing, and how to migrate existing code to the new behavior. @@ -35,7 +33,7 @@ We will not introduce new deprecations in patch releases. Deprecations will only be enforced in **major** releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the -deprecation removed in the next next major release (2.0.0). +deprecation removed in the next major release (2.0.0). .. note:: @@ -51,7 +49,7 @@ pandas may change the behavior of experimental features at any time. Python support ~~~~~~~~~~~~~~ -pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in -pandas **major** releases. +pandas mirrors the `NumPy guidelines for Python support `__. + .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index d331491d02883..f935c27d9917d 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -53,6 +53,31 @@ need to implement certain operations expected by pandas users (for example the algorithm used in, ``Series.str.upper``). That work may be done outside of pandas. +Consistent missing value handling +--------------------------------- + +Currently, pandas handles missing data differently for different data types. We +use different types to indicate that a value is missing (``np.nan`` for +floating-point data, ``np.nan`` or ``None`` for object-dtype data -- typically +strings or booleans -- with missing values, and ``pd.NaT`` for datetimelike +data). Integer data cannot store missing data or are cast to float. In addition, +pandas 1.0 introduced a new missing value sentinel, ``pd.NA``, which is being +used for the experimental nullable integer, boolean, and string data types. + +These different missing values have different behaviors in user-facing +operations. Specifically, we introduced different semantics for the nullable +data types for certain operations (e.g. propagating in comparison operations +instead of comparing as False). + +Long term, we want to introduce consistent missing data handling for all data +types. This includes consistent behavior in all operations (indexing, arithmetic +operations, comparisons, etc.). There has been discussion of eventually making +the new semantics the default. + +This has been discussed at :issue:`28095` (and +linked issues), and described in more detail in this +`design doc `__. 
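To make the difference in semantics concrete, here is a small sketch of the current behavior, contrasting the float dtype (``np.nan`` sentinel) with the experimental nullable integer dtype (``pd.NA`` sentinel):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Float dtype: np.nan compares as False
    pd.Series([1.0, np.nan]) == 1.0             # [True, False]

    # Nullable integer dtype: pd.NA propagates through the comparison
    pd.Series([1, pd.NA], dtype="Int64") == 1   # [True, <NA>]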
+ Apache Arrow interoperability ----------------------------- @@ -103,8 +128,51 @@ We propose that it should only work with positional indexing, and the translatio to positions should be entirely done at a higher level. Indexing is a complicated API with many subtleties. This refactor will require care -and attention. More details are discussed at -https://github.com/pandas-dev/pandas/wiki/(Tentative)-rules-for-restructuring-indexing-code +and attention. The following principles should inspire refactoring of indexing code and +should result on cleaner, simpler, and more performant code. + +1. **Label indexing must never involve looking in an axis twice for the same label(s).** +This implies that any validation step must either: + + * limit validation to general features (e.g. dtype/structure of the key/index), or + * reuse the result for the actual indexing. + +2. **Indexers must never rely on an explicit call to other indexers.** +For instance, it is OK to have some internal method of ``.loc`` call some +internal method of ``__getitem__`` (or of their common base class), +but never in the code flow of ``.loc`` should ``the_obj[something]`` appear. + +3. **Execution of positional indexing must never involve labels** (as currently, sadly, happens). +That is, the code flow of a getter call (or a setter call in which the right hand side is non-indexed) +to ``.iloc`` should never involve the axes of the object in any way. + +4. **Indexing must never involve accessing/modifying values** (i.e., act on ``._data`` or ``.values``) **more than once.** +The following steps must hence be clearly decoupled: + + * find positions we need to access/modify on each axis + * (if we are accessing) derive the type of object we need to return (dimensionality) + * actually access/modify the values + * (if we are accessing) construct the return object + +5. As a corollary to the decoupling between 4.i and 4.iii, **any code which deals on how data is stored** +(including any combination of handling multiple dtypes, and sparse storage, categoricals, third-party types) +**must be independent from code that deals with identifying affected rows/columns**, +and take place only once step 4.i is completed. + + * In particular, such code should most probably not live in ``pandas/core/indexing.py`` + * ... and must not depend in any way on the type(s) of axes (e.g. no ``MultiIndex`` special cases) + +6. As a corollary to point 1.i, **``Index`` (sub)classes must provide separate methods for any desired validity check of label(s) which does not involve actual lookup**, +on the one side, and for any required conversion/adaptation/lookup of label(s), on the other. + +7. **Use of trial and error should be limited**, and anyway restricted to catch only exceptions +which are actually expected (typically ``KeyError``). + + * In particular, code should never (intentionally) raise new exceptions in the ``except`` portion of a ``try... exception`` + +8. **Any code portion which is not specific to setters and getters must be shared**, +and when small differences in behavior are expected (e.g. getting with ``.loc`` raises for +missing labels, setting still doesn't), they can be managed with a specific parameter. Numba-accelerated operations ---------------------------- @@ -115,20 +183,6 @@ ways for users to apply their own Numba-jitted functions where pandas accepts us and in groupby and window contexts). This will improve the performance of user-defined-functions in these operations by staying within compiled code. 
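One of the entry points that already exists is the ``engine="numba"`` option accepted by rolling and groupby ``apply``/``transform``. A minimal sketch (Numba must be installed; the function below is just an example user-defined function):

.. code-block:: python

    import numpy as np
    import pandas as pd

    def mean_abs_change(values):
        # receives a NumPy array per window when raw=True
        return np.abs(values[1:] - values[:-1]).mean()

    s = pd.Series(np.random.randn(1_000))
    # the UDF is JIT-compiled by Numba on the first call
    result = s.rolling(20).apply(mean_abs_change, engine="numba", raw=True)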
- -Documentation improvements --------------------------- - -We'd like to improve the content, structure, and presentation of the pandas documentation. -Some specific goals include - -* Overhaul the HTML theme with a modern, responsive design (:issue:`15556`) -* Improve the "Getting Started" documentation, designing and writing learning paths - for users different backgrounds (e.g. brand new to programming, familiar with - other languages like R, already familiar with Python). -* Improve the overall organization of the documentation and specific subsections - of the documentation to make navigation and finding content easier. - Performance monitoring ---------------------- @@ -177,3 +231,20 @@ should be notified of the proposal. When there's agreement that an implementation would be welcome, the roadmap should be updated to include the summary and a link to the discussion issue. + +Completed items +--------------- + +This section records now completed items from the pandas roadmap. + +Documentation improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We improved the pandas documentation + +* The pandas community worked with others to build the `pydata-sphinx-theme`_, + which is now used for https://pandas.pydata.org/docs/ (:issue:`15556`). +* :ref:`getting_started` contains a number of resources intended for new + pandas users coming from a variety of backgrounds (:issue:`26831`). + +.. _pydata-sphinx-theme: https://github.com/pydata/pydata-sphinx-theme diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b02d4abd3ddf8..53b7aae3f7ab1 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -19,7 +19,7 @@ development to remain focused around it's original requirements. This is an inexhaustive list of projects that build on pandas in order to provide tools in the PyData space. For a list of projects that depend on pandas, see the -`libraries.io usage page for pandas `_ +`GitHub network dependents for pandas `_ or `search pypi for pandas `_. We'd like to make it easier for users to find these projects, if you know of other @@ -30,16 +30,18 @@ substantial projects that you feel should be on this list, please let us know. Data cleaning and validation ---------------------------- -`Pyjanitor `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Pyjanitor `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pyjanitor provides a clean API for cleaning data, using method chaining. -`Engarde `__ +`Pandera `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Engarde is a lightweight library used to explicitly state assumptions about your datasets -and check that they're *actually* true. +Pandera provides a flexible and expressive API for performing data validation on dataframes +to make data processing pipelines more readable and robust. +Dataframes contain information that pandera explicitly validates at runtime. This is useful in +production-critical data pipelines or reproducible research settings. `pandas-path `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,22 +71,39 @@ a long-standing special relationship with pandas. Statsmodels provides powerful econometrics, analysis and modeling functionality that is out of pandas' scope. Statsmodels leverages pandas objects as the underlying data container for computation. 
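A minimal sketch of that interplay (column names are illustrative; statsmodels' formula API accepts the DataFrame directly and returns pandas objects):

.. code-block:: python

    import pandas as pd
    import statsmodels.formula.api as smf

    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2.1, 3.9, 6.2, 8.1, 9.8]})
    fitted = smf.ols("y ~ x", data=df).fit()
    fitted.params  # a pandas Series of estimated coefficients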
-`sklearn-pandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`sklearn-pandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use pandas DataFrames in your `scikit-learn `__ ML pipeline. -`Featuretools `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Featuretools `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. +`Compose `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. + +`STUMPY `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +STUMPY is a powerful and scalable Python library for modern time series analysis. +At its core, STUMPY efficiently computes something called a +`matrix profile `__, +which can be used for a wide variety of time series data mining tasks. + .. _ecosystem.visualization: Visualization ------------- +`Pandas has its own Styler class for table visualization `_, and while +:ref:`pandas also has built-in support for data visualization through charts with matplotlib `, +there are a number of other pandas-compatible libraries. + `Altair `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -93,11 +112,11 @@ With Altair, you can spend more time understanding your data and its meaning. Altair's API is simple, friendly and consistent and built on top of the powerful Vega-Lite JSON specification. This elegant simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with Pandas DataFrames. +minimal amount of code. Altair works with pandas DataFrames. -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Bokeh `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Bokeh is a Python interactive visualization library for large datasets that natively uses the latest web technologies. Its goal is to provide elegant, concise construction of novel @@ -105,7 +124,7 @@ graphics in the style of Protovis/D3, while delivering high-performance interact large data to thin clients. `Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native Pandas plotting backend via +for Bokeh that can be loaded as a native pandas plotting backend via .. code:: python @@ -128,7 +147,7 @@ estimation while plotting, aggregating across observations and visualizing the fit of statistical models to emphasize patterns in a dataset. `plotnine `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. Based on `"The Grammar of Graphics" `__ it @@ -140,12 +159,27 @@ A good implementation for Python users is `has2k1/plotnine `__ leverages `Vega -`__ to create plots within Jupyter Notebook. +`__ to create plots within Jupyter Notebook. + +`Plotly `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. 
Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline `__, or `on-premise `__ accounts for private use. + +`Lux `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`Plotly `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Lux `__ is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas: -`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +.. code:: python + + import lux + import pandas as pd + + df = pd.read_csv("data.csv") + df # discover interesting insights! + +By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allow users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code. `Qtpandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -165,22 +199,33 @@ invoked with the following command .. code:: python - import dtale; dtale.show(df) + import dtale + + dtale.show(df) -D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle -& Google Colab. Here are some demos of the `grid `__ -and `chart-builder `__. +D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle +& Google Colab. Here are some demos of the `grid `__. + +`hvplot `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__. +It can be loaded as a native pandas plotting backend via + +.. code:: python + + pd.set_option("plotting.backend", "hvplot") .. _ecosystem.ide: IDE ------- +--- `IPython `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IPython is an interactive command shell and distributed computing -environment. IPython tab completion works with Pandas methods and also +environment. IPython tab completion works with pandas methods and also attributes like DataFrame columns. `Jupyter Notebook / Jupyter Lab `__ @@ -194,7 +239,7 @@ Jupyter notebooks can be converted to a number of open standard output formats Python) through 'Download As' in the web interface and ``jupyter convert`` in a shell. 
-Pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods +pandas DataFrames implement ``_repr_html_`` and ``_repr_latex`` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be @@ -218,19 +263,19 @@ debugging and profiling functionality of a software development tool with the data exploration, interactive execution, deep inspection and rich visualization capabilities of a scientific environment like MATLAB or Rstudio. -Its `Variable Explorer `__ +Its `Variable Explorer `__ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. -Pandas objects can also be renamed, duplicated, new columns added, -copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. +pandas objects can also be renamed, duplicated, new columns added, +copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. Most pandas classes, methods and data attributes can be autocompleted in -Spyder's `Editor `__ and -`IPython Console `__, -and Spyder's `Help pane `__ can retrieve +Spyder's `Editor `__ and +`IPython Console `__, +and Spyder's `Help pane `__ can retrieve and render Numpydoc documentation on pandas objects in rich text with Sphinx both automatically and on-demand. @@ -266,20 +311,20 @@ The following data feeds are available: * Stooq Index Data * MOEX Data -`Quandl/Python `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Quandl/Python `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quandl API for Python wraps the Quandl REST API to return -Pandas DataFrames with timeseries indexes. +pandas DataFrames with timeseries indexes. `Pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the `Refinitiv Datastream (DWS) `__ -REST API to return indexed Pandas DataFrames with financial data. +REST API to return indexed pandas DataFrames with financial data. This package requires valid credentials for this API (non free). -`pandaSDMX `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`pandaSDMX `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pandaSDMX is a library to retrieve and acquire statistical data and metadata disseminated in `SDMX `_ 2.1, an ISO-standard @@ -298,19 +343,35 @@ HTTP API, and also provides several convenient methods for parsing and analyzing fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that you can obtain for free on the FRED website. +`dataframe_sql `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``dataframe_sql`` is a Python package that translates SQL syntax directly into +operations on pandas DataFrames. This is useful when migrating from a database to +using pandas or for users more comfortable with SQL looking for a way to interface +with pandas. + .. 
_ecosystem.domain: Domain specific --------------- -`Geopandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Geopandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Geopandas extends pandas data objects to include geographic information which support geometric operations. If your work entails maps and geographical coordinates, and you love pandas, you should take a close look at Geopandas. +`staircase `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +staircase is a data analysis package, built upon pandas and numpy, for modelling and +manipulation of mathematical step functions. It provides a rich variety of arithmetic +operations, relational operations, logical operations, statistical operations and +aggregations for step functions defined over real numbers, datetime and timedelta domains. + + `xarray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -333,21 +394,58 @@ far exceeding the performance of the native ``df.to_sql`` method. Internally, it Microsoft's BCP utility, but the complexity is fully abstracted away from the end user. Rigorously tested, it is a complete replacement for ``df.to_sql``. +`Deltalake `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Deltalake python package lets you access tables stored in +`Delta Lake `__ natively in Python without the need to use Spark or +JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert +any Delta table into Pandas dataframe. + .. _ecosystem.out-of-core: Out-of-core -------------- +----------- `Blaze `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables, +in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables, PySpark. -`Dask `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Cylon `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Cylon is a fast, scalable, distributed memory parallel runtime with a pandas +like Python DataFrame API. ”Core Cylon” is implemented with C++ using Apache +Arrow format to represent the data in-memory. Cylon DataFrame API implements +most of the core operators of pandas such as merge, filter, join, concat, +group-by, drop_duplicates, etc. These operators are designed to work across +thousands of cores to scale applications. It can interoperate with pandas +DataFrame by reading data from pandas or converting data to pandas so users +can selectively scale parts of their pandas DataFrame applications. + +.. code:: python + + from pycylon import read_csv, DataFrame, CylonEnv + from pycylon.net import MPIConfig + + # Initialize Cylon distributed environment + config: MPIConfig = MPIConfig() + env: CylonEnv = CylonEnv(config=config, distributed=True) + + df1: DataFrame = read_csv('/tmp/csv1.csv') + df2: DataFrame = read_csv('/tmp/csv2.csv') + + # Using 1000s of cores across the cluster to compute the join + df3: Table = df1.join(other=df2, on=[0], algorithm="hash", env=env) + + print(df3) + +`Dask `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Dask is a flexible parallel computing library for analytics. Dask provides a familiar ``DataFrame`` interface for out-of-core, parallel and distributed computing. @@ -357,13 +455,36 @@ provides a familiar ``DataFrame`` interface for out-of-core, parallel and distri Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. 
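A minimal sketch of the Dask workflow (the file path and column names are illustrative):

.. code-block:: python

    import dask.dataframe as dd

    # lazily read a directory of CSV files into partitioned DataFrames
    ddf = dd.read_csv("data/*.csv")

    # familiar pandas-style API; nothing is executed until .compute()
    result = ddf.groupby("key")["value"].mean().compute()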
+`Ibis `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). + + `Koalas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Koalas provides a familiar pandas DataFrame interface on top of Apache Spark. It enables users to leverage multi-cores on one machine or a cluster of machines to speed up or scale their DataFrame code. -`Odo `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Modin `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement +for pandas. This means that you can use Modin with existing pandas code or write +new code with the existing pandas API. Modin can leverage your entire machine or +cluster to speed up and scale your pandas workloads, including traditionally +time-consuming tasks like ingesting data (``read_csv``, ``read_excel``, +``read_parquet``, etc.). + +.. code:: python + + # import pandas as pd + import modin.pandas as pd + + df = pd.read_csv("big.csv") # use all your cores! + +`Odo `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Odo provides a uniform API for moving data between different formats. It uses pandas own ``read_csv`` for CSV IO and leverages many existing packages such as @@ -386,21 +507,11 @@ If also displays progress bars. # df.apply(func) df.parallel_apply(func) -`Ray `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Pandas on Ray is an early stage DataFrame library that wraps Pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous Pandas notebooks while experiencing a considerable speedup from Pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use Pandas on Ray just like you would Pandas. -.. code:: python - - # import pandas as pd - import ray.dataframe as pd - - -`Vaex `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Vaex `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to Pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). +Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a Python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. 
Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). * vaex.from_pandas * vaex.to_pandas_df @@ -410,7 +521,7 @@ Increasingly, packages are being built on top of pandas to address specific need Extension data types -------------------- -Pandas provides an interface for defining +pandas provides an interface for defining :ref:`extension types ` to extend NumPy's type system. The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. @@ -421,14 +532,27 @@ found in NumPy or pandas, which work well with pandas' data containers. Cyberpandas provides an extension type for storing arrays of IP Addresses. These arrays can be stored inside pandas' Series and DataFrame. +`Pandas-Genomics`_ +~~~~~~~~~~~~~~~~~~ + +Pandas-Genomics provides extension types, extension arrays, and extension accessors for working with genomics data + `Pint-Pandas`_ ~~~~~~~~~~~~~~ -`Pint-Pandas ` provides an extension type for +`Pint-Pandas `_ provides an extension type for storing numeric arrays with units. These arrays can be stored inside pandas' Series and DataFrame. Operations between Series and DataFrame columns which use pint's extension array are then units aware. +`Text Extensions for Pandas`_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Text Extensions for Pandas `_ +provides extension types to cover common data structures for representing natural language +data, plus library integrations that convert the outputs of popular natural language +processing libraries into Pandas DataFrames. + .. _ecosystem.accessors: Accessors @@ -438,18 +562,41 @@ A directory of projects providing :ref:`extension accessors `. This is for users to discover new accessors and for library authors to coordinate on the namespace. -=============== ========== ========================= =============================================================== -Library Accessor Classes Description -=============== ========== ========================= =============================================================== -`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. -`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. -`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. -`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. -=============== ========== ========================= =============================================================== +================== ============ ==================================== =============================================================================== +Library Accessor Classes Description +================== ============ ==================================== =============================================================================== +`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. +`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. +`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data. +`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. 
+`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. +`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. +`datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers. +`woodwork`_ ``ww`` ``Series``, ``DataFrame`` Provides physical, logical, and semantic data typing information for Series and DataFrames. +`staircase`_ ``sc`` ``Series`` Provides methods for querying, aggregating and plotting step functions +================== ============ ==================================== =============================================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest .. _pdvega: https://altair-viz.github.io/pdvega/ .. _Altair: https://altair-viz.github.io/ +.. _pandas-genomics: https://pandas-genomics.readthedocs.io/en/latest/ .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas +.. _composeml: https://github.com/alteryx/compose +.. _datatest: https://datatest.readthedocs.io/en/stable/ +.. _woodwork: https://github.com/alteryx/woodwork +.. _staircase: https://www.staircase.dev/ + +Development tools +----------------- + +`pandas-stubs `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While pandas repository is partially typed, the package itself doesn't expose this information for external use. +Install pandas-stubs to enable basic type coverage of pandas API. + +Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`. + +See installation and usage instructions on the `GitHub page `__. diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index e1a4cfe49b7d1..767779b0f58a8 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -5,11 +5,11 @@ Comparison with R / R libraries ******************************* -Since ``pandas`` aims to provide a lot of the data manipulation and analysis +Since pandas aims to provide a lot of the data manipulation and analysis functionality that people use `R `__ for, this page was started to provide a more detailed look at the `R language `__ and its many third -party libraries as they relate to ``pandas``. In comparisons with R and CRAN +party libraries as they relate to pandas. In comparisons with R and CRAN libraries, we care about the following things: * **Functionality / flexibility**: what can/cannot be done with each tool @@ -21,17 +21,13 @@ libraries, we care about the following things: This page is also here to offer a bit of a translation guide for users of these R packages. -For transfer of ``DataFrame`` objects from ``pandas`` to R, one option is to -use HDF5 files, see :ref:`io.external_compatibility` for an -example. - Quick reference --------------- We'll start off with a quick reference guide pairing some common R operations using `dplyr -`__ with +`__ with pandas equivalents. @@ -118,20 +114,20 @@ or by integer location df <- data.frame(matrix(rnorm(1000), ncol=100)) df[, c(1:10, 25:30, 40, 50:100)] -Selecting multiple columns by name in ``pandas`` is straightforward +Selecting multiple columns by name in pandas is straightforward .. 
ipython:: python - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) - df[['a', 'c']] - df.loc[:, ['a', 'c']] + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df[["a", "c"]] + df.loc[:, ["a", "c"]] Selecting multiple noncontiguous columns by integer location can be achieved with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. .. ipython:: python - named = list('abcdefg') + named = list("abcdefg") n = 30 columns = named + np.arange(len(named), n).tolist() df = pd.DataFrame(np.random.randn(n, n), columns=columns) @@ -160,14 +156,29 @@ function. .. ipython:: python df = pd.DataFrame( - {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan]}) + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) - g = df.groupby(['by1', 'by2']) - g[['v1', 'v2']].mean() + g = df.groupby(["by1", "by2"]) + g[["v1", "v2"]].mean() For more details and examples see :ref:`the groupby documentation `. @@ -220,7 +231,7 @@ since the subclass sizes are possibly irregular. Using a data.frame called tapply(baseball$batting.average, baseball.example$team, max) -In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: +In pandas we may use :meth:`~pandas.pivot_table` method to handle this: .. ipython:: python @@ -228,11 +239,14 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import string baseball = pd.DataFrame( - {'team': ["team %d" % (x + 1) for x in range(5)] * 5, - 'player': random.sample(list(string.ascii_lowercase), 25), - 'batting avg': np.random.uniform(.200, .400, 25)}) + { + "team": ["team %d" % (x + 1) for x in range(5)] * 5, + "player": random.sample(list(string.ascii_lowercase), 25), + "batting avg": np.random.uniform(0.200, 0.400, 25), + } + ) - baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) + baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation `. @@ -250,16 +264,16 @@ column's values are less than another column's values: subset(df, a <= b) df[df$a <= df$b,] # note the comma -In ``pandas``, there are a few ways to perform subsetting. You can use +In pandas, there are a few ways to perform subsetting. You can use :meth:`~pandas.DataFrame.query` or pass an expression as if it were an index/slice as well as standard boolean indexing: .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.query('a <= b') - df[df['a'] <= df['b']] - df.loc[df['a'] <= df['b']] + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.query("a <= b") + df[df["a"] <= df["b"]] + df.loc[df["a"] <= df["b"]] For more details and examples see :ref:`the query documentation `. @@ -277,14 +291,14 @@ An expression using a data.frame called ``df`` in R with the columns ``a`` and with(df, a + b) df$a + df$b # same as the previous expression -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.eval` method, would be: .. 
ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.eval('a + b') - df['a'] + df['b'] # same as the previous expression + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("a + b") + df["a"] + df["b"] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval @@ -308,8 +322,8 @@ table below shows how these data structures could be mapped in Python. | data.frame | dataframe | +------------+-------------------------------+ -|ddply|_ -~~~~~~~~ +ddply +~~~~~ An expression using a data.frame called ``df`` in R where you want to summarize ``x`` by ``month``: @@ -329,19 +343,23 @@ summarize ``x`` by ``month``: mean = round(mean(x), 2), sd = round(sd(x), 2)) -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.groupby` method, would be: .. ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5, 6, 7, 8] * 30, - 'week': np.random.randint(1, 4, 120)}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 120), + "y": np.random.uniform(7.0, 334.0, 120), + "z": np.random.uniform(1.7, 20.7, 120), + "month": [5, 6, 7, 8] * 30, + "week": np.random.randint(1, 4, 120), + } + ) - grouped = df.groupby(['month', 'week']) - grouped['x'].agg([np.mean, np.std]) + grouped = df.groupby(["month", "week"]) + grouped["x"].agg([np.mean, np.std]) For more details and examples see :ref:`the groupby documentation @@ -350,8 +368,8 @@ For more details and examples see :ref:`the groupby documentation reshape / reshape2 ------------------ -|meltarray|_ -~~~~~~~~~~~~~ +meltarray +~~~~~~~~~ An expression using a 3 dimensional array called ``a`` in R where you want to melt it into a data.frame: @@ -368,8 +386,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension. a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) -|meltlist|_ -~~~~~~~~~~~~ +meltlist +~~~~~~~~ An expression using a list called ``a`` in R where you want to melt it into a data.frame: @@ -390,8 +408,8 @@ In Python, this list would be a list of tuples, so For more details and examples see :ref:`the Into to Data Structures documentation `. -|meltdf|_ -~~~~~~~~~~~~~~~~ +meltdf +~~~~~~ An expression using a data.frame called ``cheese`` in R where you want to reshape the data.frame: @@ -410,19 +428,23 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) - pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + pd.melt(cheese, id_vars=["first", "last"]) + cheese.set_index(["first", "last"]).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. -|cast|_ -~~~~~~~ +cast +~~~~ In R ``acast`` is an expression using a data.frame called ``df`` in R to cast into a higher dimensional array: @@ -444,15 +466,24 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. 
ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5, 6, 7] * 4, - 'week': [1, 2] * 6}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 12), + "y": np.random.uniform(7.0, 334.0, 12), + "z": np.random.uniform(1.7, 20.7, 12), + "month": [5, 6, 7] * 4, + "week": [1, 2] * 6, + } + ) - mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable', 'week'], - columns=['month'], aggfunc=np.mean) + mdf = pd.melt(df, id_vars=["month", "week"]) + pd.pivot_table( + mdf, + values="value", + index=["variable", "week"], + columns=["month"], + aggfunc=np.mean, + ) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -475,21 +506,29 @@ using :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({ - 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', - 'Animal2', 'Animal3'], - 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], - 'Amount': [10, 7, 4, 2, 5, 6, 2], - }) + df = pd.DataFrame( + { + "Animal": [ + "Animal1", + "Animal2", + "Animal3", + "Animal2", + "Animal1", + "Animal2", + "Animal3", + ], + "FeedType": ["A", "B", "A", "A", "B", "B", "A"], + "Amount": [10, 7, 4, 2, 5, 6, 2], + } + ) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', - aggfunc='sum') + df.pivot_table(values="Amount", index="Animal", columns="FeedType", aggfunc="sum") The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. ipython:: python - df.groupby(['Animal', 'FeedType'])['Amount'].sum() + df.groupby(["Animal", "FeedType"])["Amount"].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. @@ -534,20 +573,5 @@ For more details and examples see :ref:`categorical introduction ` .. |subset| replace:: ``subset`` .. _subset: https://stat.ethz.ch/R-manual/R-patched/library/base/html/subset.html -.. |ddply| replace:: ``ddply`` -.. _ddply: https://cran.r-project.org/web/packages/plyr/plyr.pdf#Rfn.ddply.1 - -.. |meltarray| replace:: ``melt.array`` -.. _meltarray: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.array.1 - -.. |meltlist| replace:: ``melt.list`` -.. meltlist: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.list.1 - -.. |meltdf| replace:: ``melt.data.frame`` -.. meltdf: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.melt.data.frame.1 - -.. |cast| replace:: ``cast`` -.. cast: https://cran.r-project.org/web/packages/reshape2/reshape2.pdf#Rfn.cast.1 - .. |factor| replace:: ``factor`` .. _factor: https://stat.ethz.ch/R-manual/R-devel/library/base/html/factor.html diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 85c6ea2c31969..595f3c85a9dc2 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -4,32 +4,13 @@ Comparison with SAS ******************** + For potential users coming from `SAS `__ this page is meant to demonstrate how different SAS operations would be performed in pandas. -If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` -to familiarize yourself with the library. - -As is customary, we import pandas and NumPy as follows: - -.. 
ipython:: python - - import pandas as pd - import numpy as np - - -.. note:: +.. include:: includes/introduction.rst - Throughout this tutorial, the pandas ``DataFrame`` will be displayed by calling - ``df.head()``, which displays the first N (default 5) rows of the ``DataFrame``. - This is often used in interactive work (e.g. `Jupyter notebook - `_ or terminal) - the equivalent in SAS would be: - - .. code-block:: sas - - proc print data=df(obs=5); - run; Data structures --------------- @@ -48,14 +29,17 @@ General terminology translation ``NaN``, ``.`` -``DataFrame`` / ``Series`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ +``DataFrame`` +~~~~~~~~~~~~~ A ``DataFrame`` in pandas is analogous to a SAS data set - a two-dimensional data source with labeled columns that can be of different types. As will be shown in this document, almost any operation that can be applied to a data set using SAS's ``DATA`` step, can also be accomplished in pandas. +``Series`` +~~~~~~~~~~ + A ``Series`` is the data structure that represents one column of a ``DataFrame``. SAS doesn't have a separate data structure for a single column, but in general, working with a ``Series`` is analogous to referencing a column @@ -78,6 +62,12 @@ see the :ref:`indexing documentation` for much more on how to use an ``Index`` effectively. +Copies vs. in place operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. include:: includes/copies.rst + + Data input / output ------------------- @@ -99,23 +89,14 @@ specifying the column names. ; run; -A pandas ``DataFrame`` can be constructed in many different ways, -but for a small number of values, it is often convenient to specify it as -a Python dictionary, where the keys are the column names -and the values are the data. - -.. ipython:: python - - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) - df - +.. include:: includes/construct_dataframe.rst Reading external data ~~~~~~~~~~~~~~~~~~~~~ Like SAS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. SAS provides ``PROC IMPORT`` to read csv data into a data set. @@ -130,10 +111,12 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python - url = ('https://raw.github.com/pandas-dev/' - 'pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.githubusercontent.com/pandas-dev/" + "pandas/main/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) - tips.head() + tips Like ``PROC IMPORT``, ``read_csv`` can take a number of parameters to specify @@ -142,15 +125,28 @@ and did not have column names, the pandas command would be: .. code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) In addition to text/csv, pandas supports a variety of other data formats such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*`` function. See the :ref:`IO documentation` for more details. +Limiting output +~~~~~~~~~~~~~~~ + +.. include:: includes/limit.rst + +The equivalent in SAS would be: + +.. code-block:: sas + + proc print data=df(obs=5); + run; + + Exporting data ~~~~~~~~~~~~~~ @@ -166,7 +162,7 @@ and other data formats follow a similar api. .. 
code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") Data operations @@ -186,20 +182,8 @@ be used on new or existing columns. new_bill = total_bill / 2; run; -pandas provides similar vectorized operations by -specifying the individual ``Series`` in the ``DataFrame``. -New columns can be assigned in the same way. - -.. ipython:: python +.. include:: includes/column_operations.rst - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2.0 - tips.head() - -.. ipython:: python - :suppress: - - tips = tips.drop('new_bill', axis=1) Filtering ~~~~~~~~~ @@ -221,12 +205,7 @@ or more columns. DATA step begins and can also be used in PROC statements */ run; -DataFrames can be filtered in multiple ways; the most intuitive of which is using -:ref:`boolean indexing ` - -.. ipython:: python - - tips[tips['total_bill'] > 10].head() +.. include:: includes/filtering.rst If/then logic ~~~~~~~~~~~~~ @@ -243,18 +222,7 @@ In SAS, if/then logic can be used to create new columns. else bucket = 'high'; run; -The same operation in pandas can be accomplished using -the ``where`` method from ``numpy``. - -.. ipython:: python - - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') - tips.head() - -.. ipython:: python - :suppress: - - tips = tips.drop('bucket', axis=1) +.. include:: includes/if_then.rst Date functionality ~~~~~~~~~~~~~~~~~~ @@ -282,24 +250,7 @@ functions pandas supports other Time Series features not available in Base SAS (such as resampling and custom offsets) - see the :ref:`timeseries documentation` for more details. -.. ipython:: python - - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = ( - tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')) - - tips[['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between']].head() - -.. ipython:: python - :suppress: - - tips = tips.drop(['date1', 'date2', 'date1_year', - 'date2_month', 'date1_next', 'months_between'], axis=1) +.. include:: includes/time_date.rst Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -324,18 +275,7 @@ drop, and rename columns. rename total_bill=total_bill_2; run; -The same operations are expressed in pandas below. - -.. ipython:: python - - # keep - tips[['sex', 'total_bill', 'tip']].head() - - # drop - tips.drop('sex', axis=1).head() - - # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() +.. include:: includes/column_selection.rst Sorting by values @@ -349,20 +289,13 @@ Sorting in SAS is accomplished via ``PROC SORT`` by sex total_bill; run; -pandas objects have a :meth:`~DataFrame.sort_values` method, which -takes a list of columns to sort by. - -.. ipython:: python - - tips = tips.sort_values(['sex', 'total_bill']) - tips.head() - +.. include:: includes/sorting.rst String processing ----------------- -Length -~~~~~~ +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ SAS determines the length of a character string with the `LENGTHN `__ @@ -377,18 +310,11 @@ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailin put(LENGTHC(time)); run; -Python determines the length of a character string with the ``len`` function. -``len`` includes trailing blanks. Use ``len`` and ``rstrip`` to exclude -trailing blanks. - -.. ipython:: python +.. 
include:: includes/length.rst - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() - -Find -~~~~ +Finding position of substring +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SAS determines the position of a character in a string with the `FINDW `__ function. @@ -402,22 +328,14 @@ you supply as the second argument. put(FINDW(sex,'ale')); run; -Python determines the position of a character in a string with the -``find`` function. ``find`` searches for the first position of the -substring. If the substring is found, the function returns its -position. Keep in mind that Python indexes are zero-based and -the function will return -1 if it fails to find the substring. - -.. ipython:: python - - tips['sex'].str.find("ale").head() +.. include:: includes/find_substring.rst -Substring -~~~~~~~~~ +Extracting substring by position +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SAS extracts a substring from a string based on its position with the -`SUBSTR `__ function. +`SUBSTR `__ function. .. code-block:: sas @@ -426,17 +344,11 @@ SAS extracts a substring from a string based on its position with the put(substr(sex,1,1)); run; -With pandas you can use ``[]`` notation to extract a substring -from a string by position locations. Keep in mind that Python -indexes are zero-based. - -.. ipython:: python - - tips['sex'].str[0:1].head() +.. include:: includes/extract_substring.rst -Scan -~~~~ +Extracting nth word +~~~~~~~~~~~~~~~~~~~ The SAS `SCAN `__ function returns the nth word from a string. The first argument is the string you want to parse and the @@ -454,20 +366,11 @@ second argument specifies which word you want to extract. ;;; run; -Python extracts a substring from a string based on its text -by using regular expressions. There are much more powerful -approaches, but this just shows a simple approach. - -.. ipython:: python - - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[0] - firstlast +.. include:: includes/nth_word.rst -Upcase, lowcase, and propcase -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Changing case +~~~~~~~~~~~~~ The SAS `UPCASE `__ `LOWCASE `__ and @@ -487,29 +390,13 @@ functions change the case of the argument. ;;; run; -The equivalent Python functions are ``upper``, ``lower``, and ``title``. +.. include:: includes/case.rst -.. ipython:: python - - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['string_up'] = firstlast['String'].str.upper() - firstlast['string_low'] = firstlast['String'].str.lower() - firstlast['string_prop'] = firstlast['String'].str.title() - firstlast Merging ------- -The following tables will be used in the merge examples - -.. ipython:: python - - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) - df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) - df2 +.. include:: includes/merge_setup.rst In SAS, data must be explicitly sorted before merging. Different types of joins are accomplished using the ``in=`` dummy @@ -535,39 +422,15 @@ input frames. if a or b then output outer_join; run; -pandas DataFrames have a :meth:`~DataFrame.merge` method, which provides -similar functionality. Note that the data does not have -to be sorted ahead of time, and different join -types are accomplished via the ``how`` keyword. - -.. 
ipython:: python - - inner_join = df1.merge(df2, on=['key'], how='inner') - inner_join - - left_join = df1.merge(df2, on=['key'], how='left') - left_join - - right_join = df1.merge(df2, on=['key'], how='right') - right_join - - outer_join = df1.merge(df2, on=['key'], how='outer') - outer_join +.. include:: includes/merge.rst Missing data ------------ -Like SAS, pandas has a representation for missing data - which is the -special float value ``NaN`` (not a number). Many of the semantics -are the same, for example missing data propagates through numeric -operations, and is ignored by default for aggregations. - -.. ipython:: python +Both pandas and SAS have a representation for missing data. - outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() +.. include:: includes/missing_intro.rst One difference is that missing data cannot be compared to its sentinel value. For example, in SAS you could do this to filter missing values. @@ -584,25 +447,7 @@ For example, in SAS you could do this to filter missing values. if value_x ^= .; run; -Which doesn't work in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions -should be used for comparisons. - -.. ipython:: python - - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] - -pandas also provides a variety of methods to work with missing data - some of -which would be challenging to express in SAS. For example, there are methods to -drop all rows with any missing values, replacing missing values with a specified -value, like the mean, or forward filling from previous rows. See the -:ref:`missing data documentation` for more. - -.. ipython:: python - - outer_join.dropna() - outer_join.fillna(method='ffill') - outer_join['value_x'].fillna(outer_join['value_x'].mean()) +.. include:: includes/missing.rst GroupBy @@ -611,7 +456,7 @@ GroupBy Aggregation ~~~~~~~~~~~ -SAS's PROC SUMMARY can be used to group by one or +SAS's ``PROC SUMMARY`` can be used to group by one or more key variables and compute aggregations on numeric columns. @@ -623,14 +468,7 @@ numeric columns. output out=tips_summed sum=; run; -pandas provides a flexible ``groupby`` mechanism that -allows similar aggregations. See the :ref:`groupby documentation` -for more details and examples. - -.. ipython:: python - - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() - tips_summed.head() +.. include:: includes/groupby.rst Transformation @@ -659,16 +497,7 @@ example, to subtract the mean for each observation by smoker group. if a and b; run; - -pandas ``groupby`` provides a ``transform`` mechanism that allows -these type of operations to be succinctly expressed in one -operation. - -.. ipython:: python - - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') - tips.head() +.. include:: includes/transform.rst By group processing @@ -695,7 +524,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations @@ -709,7 +538,7 @@ This means that the size of data able to be loaded in pandas is limited by your machine's memory, but also that the operations on that data may be faster. 
If out of core processing is needed, one possibility is the -`dask.dataframe `_ +`dask.dataframe `_ library (currently in development) which provides a subset of pandas functionality for an on-disk ``DataFrame`` @@ -729,16 +558,16 @@ the XPORT or SAS7BDAT binary format. .. code-block:: python - df = pd.read_sas('transport-file.xpt') - df = pd.read_sas('binary-file.sas7bdat') + df = pd.read_sas("transport-file.xpt") + df = pd.read_sas("binary-file.sas7bdat") You can also specify the file format directly. By default, pandas will try to infer the file format based on its extension. .. code-block:: python - df = pd.read_sas('transport-file.xpt', format='xport') - df = pd.read_sas('binary-file.sas7bdat', format='sas7bdat') + df = pd.read_sas("transport-file.xpt", format="xport") + df = pd.read_sas("binary-file.sas7bdat", format="sas7bdat") XPORT is a relatively limited format and the parsing of it is not as optimized as some of the other pandas readers. An alternative way @@ -752,4 +581,4 @@ to interop data between SAS and pandas is to serialize to csv. Wall time: 14.6 s In [9]: %time df = pd.read_csv('big.csv') - Wall time: 4.86 s \ No newline at end of file + Wall time: 4.86 s diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst new file mode 100644 index 0000000000000..d55b669d94a87 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -0,0 +1,465 @@ +.. _compare_with_spreadsheets: + +{{ header }} + +Comparison with spreadsheets +**************************** + +Since many potential pandas users have some familiarity with spreadsheet programs like +`Excel `_, this page is meant to provide some examples +of how various spreadsheet operations would be performed using pandas. This page will use +terminology and link to documentation for Excel, but much will be the same/similar in +`Google Sheets `_, +`LibreOffice Calc `_, +`Apple Numbers `_, and other +Excel-compatible spreadsheet software. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "Excel" + :widths: 20, 20 + + ``DataFrame``, worksheet + ``Series``, column + ``Index``, row headings + row, row + ``NaN``, empty cell + +``DataFrame`` +~~~~~~~~~~~~~ + +A ``DataFrame`` in pandas is analogous to an Excel worksheet. While an Excel workbook can contain +multiple worksheets, pandas ``DataFrame``\s exist independently. + +``Series`` +~~~~~~~~~~ + +A ``Series`` is the data structure that represents one column of a ``DataFrame``. Working with a +``Series`` is analogous to referencing a column of a spreadsheet. + +``Index`` +~~~~~~~~~ + +Every ``DataFrame`` and ``Series`` has an ``Index``, which are labels on the *rows* of the data. In +pandas, if no index is specified, a :class:`~pandas.RangeIndex` is used by default (first row = 0, +second row = 1, and so on), analogous to row headings/numbers in spreadsheets. + +In pandas, indexes can be set to one (or multiple) unique values, which is like having a column that +is used as the row identifier in a worksheet. Unlike most spreadsheets, these ``Index`` values can +actually be used to reference the rows. (Note that `this can be done in Excel with structured +references +`_.) +For example, in spreadsheets, you would reference the first row as ``A1:Z1``, while in pandas you +could use ``populations.loc['Chicago']``. 
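+A minimal sketch of such label-based lookups, assuming a small, made-up ``populations`` table:
+
+.. code-block:: python
+
+    populations = pd.DataFrame(
+        {"city": ["Chicago", "Boston"], "population": [2_700_000, 690_000]}
+    ).set_index("city")
+
+    # Look up a row by its index label instead of a spreadsheet cell range
+    populations.loc["Chicago"]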
+ +Index values are also persistent, so if you re-order the rows in a ``DataFrame``, the label for a +particular row don't change. + +See the :ref:`indexing documentation` for much more on how to use an ``Index`` +effectively. + + +Copies vs. in place operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Constructing a DataFrame from values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In a spreadsheet, `values can be typed directly into cells `_. + +.. include:: includes/construct_dataframe.rst + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Both `Excel `__ +and :ref:`pandas <10min_tut_02_read_write>` can import data from various sources in various +formats. + +CSV +''' + +Let's load and display the `tips `_ +dataset from the pandas tests, which is a CSV file. In Excel, you would download and then +`open the CSV `_. +In pandas, you pass the URL or local path of the CSV file to :func:`~pandas.read_csv`: + +.. ipython:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like `Excel's Text Import Wizard `_, +``read_csv`` can take a number of parameters to specify how the data should be parsed. For +example, if the data was instead tab delimited, and did not have column names, the pandas command +would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + +Excel files +''''''''''' + +Excel opens `various Excel file formats `_ +by double-clicking them, or using `the Open menu `_. +In pandas, you use :ref:`special methods for reading and writing from/to Excel files `. + +Let's first :ref:`create a new Excel file ` based on the ``tips`` dataframe in the above example: + +.. code-block:: python + + tips.to_excel("./tips.xlsx") + +Should you wish to subsequently access the data in the ``tips.xlsx`` file, you can read it into your module using + +.. code-block:: python + + tips_df = pd.read_excel("./tips.xlsx", index_col=0) + +You have just read in an Excel file using pandas! + + +Limiting output +~~~~~~~~~~~~~~~ + +Spreadsheet programs will only show one screenful of data at a time and then allow you to scroll, so +there isn't really a need to limit output. In pandas, you'll need to put a little more thought into +controlling how your ``DataFrame``\s are displayed. + +.. include:: includes/limit.rst + + +Exporting data +~~~~~~~~~~~~~~ + +By default, desktop spreadsheet software will save to its respective file format (``.xlsx``, ``.ods``, etc). You can, however, `save to other file formats `_. + +:ref:`pandas can create Excel files `, :ref:`CSV `, or :ref:`a number of other formats `. + +Data operations +--------------- + +Operations on columns +~~~~~~~~~~~~~~~~~~~~~ + +In spreadsheets, `formulas +`_ +are often created in individual cells and then `dragged +`_ +into other cells to compute them for other columns. In pandas, you're able to do operations on whole +columns directly. + +.. include:: includes/column_operations.rst + +Note that we aren't having to tell it to do that subtraction cell-by-cell — pandas handles that for +us. See :ref:`how to create new columns derived from existing columns <10min_tut_05_columns>`. + + +Filtering +~~~~~~~~~ + +`In Excel, filtering is done through a graphical menu. `_ + +.. 
image:: ../../_static/spreadsheets/filter.png + :alt: Screenshot showing filtering of the total_bill column to values greater than 10 + :align: center + +.. include:: includes/filtering.rst + +If/then logic +~~~~~~~~~~~~~ + +Let's say we want to make a ``bucket`` column with values of ``low`` and ``high``, based on whether +the ``total_bill`` is less or more than $10. + +In spreadsheets, logical comparison can be done with `conditional formulas +`_. +We'd use a formula of ``=IF(A2 < 10, "low", "high")``, dragged to all cells in a new ``bucket`` +column. + +.. image:: ../../_static/spreadsheets/conditional.png + :alt: Screenshot showing the formula from above in a bucket column of the tips spreadsheet + :align: center + +.. include:: includes/if_then.rst + +Date functionality +~~~~~~~~~~~~~~~~~~ + +*This section will refer to "dates", but timestamps are handled similarly.* + +We can think of date functionality in two parts: parsing, and output. In spreadsheets, date values +are generally parsed automatically, though there is a `DATEVALUE +`_ +function if you need it. In pandas, you need to explicitly convert plain text to datetime objects, +either :ref:`while reading from a CSV ` or :ref:`once in a DataFrame +<10min_tut_09_timeseries.properties>`. + +Once parsed, spreadsheets display the dates in a default format, though `the format can be changed +`_. +In pandas, you'll generally want to keep dates as ``datetime`` objects while you're doing +calculations with them. Outputting *parts* of dates (such as the year) is done through `date +functions +`_ +in spreadsheets, and :ref:`datetime properties <10min_tut_09_timeseries.properties>` in pandas. + +Given ``date1`` and ``date2`` in columns ``A`` and ``B`` of a spreadsheet, you might have these +formulas: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - column + - formula + * - ``date1_year`` + - ``=YEAR(A2)`` + * - ``date2_month`` + - ``=MONTH(B2)`` + * - ``date1_next`` + - ``=DATE(YEAR(A2),MONTH(A2)+1,1)`` + * - ``months_between`` + - ``=DATEDIF(A2,B2,"M")`` + +The equivalent pandas operations are shown below. + +.. include:: includes/time_date.rst + +See :ref:`timeseries` for more details. + + +Selection of columns +~~~~~~~~~~~~~~~~~~~~ + +In spreadsheets, you can select columns you want by: + +- `Hiding columns `_ +- `Deleting columns `_ +- `Referencing a range `_ from one worksheet into another + +Since spreadsheet columns are typically `named in a header row +`_, +renaming a column is simply a matter of changing the text in that first cell. + +.. include:: includes/column_selection.rst + + +Sorting by values +~~~~~~~~~~~~~~~~~ + +Sorting in spreadsheets is accomplished via `the sort dialog `_. + +.. image:: ../../_static/spreadsheets/sort.png + :alt: Screenshot of dialog from Excel showing sorting by the sex then total_bill columns + :align: center + +.. include:: includes/sorting.rst + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In spreadsheets, the number of characters in text can be found with the `LEN +`_ +function. This can be used with the `TRIM +`_ +function to remove extra whitespace. + +:: + + =LEN(TRIM(A2)) + +.. include:: includes/length.rst + +Note this will still include multiple spaces within the string, so isn't 100% equivalent. + + +Finding position of substring +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `FIND +`_ +spreadsheet function returns the position of a substring, with the first character being ``1``. + +.. 
image:: ../../_static/spreadsheets/sort.png + :alt: Screenshot of FIND formula being used in Excel + :align: center + +.. include:: includes/find_substring.rst + + +Extracting substring by position +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spreadsheets have a `MID +`_ +formula for extracting a substring from a given position. To get the first character:: + + =MID(A2,1,1) + +.. include:: includes/extract_substring.rst + + +Extracting nth word +~~~~~~~~~~~~~~~~~~~ + +In Excel, you might use the `Text to Columns Wizard +`_ +for splitting text and retrieving a specific column. (Note `it's possible to do so through a formula +as well `_.) + +.. include:: includes/nth_word.rst + + +Changing case +~~~~~~~~~~~~~ + +Spreadsheets provide `UPPER, LOWER, and PROPER functions +`_ +for converting text to upper, lower, and title case, respectively. + +.. include:: includes/case.rst + + +Merging +------- + +.. include:: includes/merge_setup.rst + +In Excel, there are `merging of tables can be done through a VLOOKUP +`_. + +.. image:: ../../_static/spreadsheets/vlookup.png + :alt: Screenshot showing a VLOOKUP formula between two tables in Excel, with some values being filled in and others with "#N/A" + :align: center + +.. include:: includes/merge.rst + +``merge`` has a number of advantages over ``VLOOKUP``: + +* The lookup value doesn't need to be the first column of the lookup table +* If multiple rows are matched, there will be one row for each match, instead of just the first +* It will include all columns from the lookup table, instead of just a single specified column +* It supports :ref:`more complex join operations ` + + +Other considerations +-------------------- + +Fill Handle +~~~~~~~~~~~ + +Create a series of numbers following a set pattern in a certain set of cells. In +a spreadsheet, this would be done by shift+drag after entering the first number or by +entering the first two or three values and then dragging. + +This can be achieved by creating a series and assigning it to the desired cells. + +.. ipython:: python + + df = pd.DataFrame({"AAA": [1] * 8, "BBB": list(range(0, 8))}) + df + + series = list(range(1, 5)) + series + + df.loc[2:5, "AAA"] = series + + df + +Drop Duplicates +~~~~~~~~~~~~~~~ + +Excel has built-in functionality for `removing duplicate values `_. +This is supported in pandas via :meth:`~DataFrame.drop_duplicates`. + +.. ipython:: python + + df = pd.DataFrame( + { + "class": ["A", "A", "A", "B", "C", "D"], + "student_count": [42, 35, 42, 50, 47, 45], + "all_pass": ["Yes", "Yes", "Yes", "No", "No", "Yes"], + } + ) + + df.drop_duplicates() + + df.drop_duplicates(["class", "student_count"]) + +Pivot Tables +~~~~~~~~~~~~ + +`PivotTables `_ +from spreadsheets can be replicated in pandas through :ref:`reshaping`. Using the ``tips`` dataset again, +let's find the average gratuity by size of the party and sex of the server. + +In Excel, we use the following configuration for the PivotTable: + +.. image:: ../../_static/spreadsheets/pivot.png + :alt: Screenshot showing a PivotTable in Excel, using sex as the column, size as the rows, then average tip as the values + :align: center + +The equivalent in pandas: + +.. ipython:: python + + pd.pivot_table( + tips, values="tip", index=["size"], columns=["sex"], aggfunc=np.average + ) + + +Adding a row +~~~~~~~~~~~~ + +Assuming we are using a :class:`~pandas.RangeIndex` (numbered ``0``, ``1``, etc.), we can use :func:`concat` to add a row to the bottom of a ``DataFrame``. + +.. 
ipython:: python + + df + new_row = pd.DataFrame([["E", 51, True]], + columns=["class", "student_count", "all_pass"]) + pd.concat([df, new_row]) + + +Find and Replace +~~~~~~~~~~~~~~~~ + +`Excel's Find dialog `_ +takes you to cells that match, one by one. In pandas, this operation is generally done for an +entire column or ``DataFrame`` at once through :ref:`conditional expressions <10min_tut_03_subset.rows_and_columns>`. + +.. ipython:: python + + tips + tips == "Sun" + tips["day"].str.contains("S") + +pandas' :meth:`~DataFrame.replace` is comparable to Excel's ``Replace All``. + +.. ipython:: python + + tips.replace("Thu", "Thursday") diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index aa7218c3e4fad..a6d9d65e85645 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -8,26 +8,27 @@ Since many potential pandas users have some familiarity with `SQL `_, this page is meant to provide some examples of how various SQL operations would be performed using pandas. -If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` -to familiarize yourself with the library. - -As is customary, we import pandas and NumPy as follows: - -.. ipython:: python - - import pandas as pd - import numpy as np +.. include:: includes/introduction.rst Most of the examples will utilize the ``tips`` dataset found within pandas tests. We'll read -the data into a DataFrame called `tips` and assume we have a database table of the same name and +the data into a DataFrame called ``tips`` and assume we have a database table of the same name and structure. .. ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) - tips.head() + tips + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + SELECT ------ @@ -37,14 +38,13 @@ to select all columns): .. code-block:: sql SELECT total_bill, tip, smoker, time - FROM tips - LIMIT 5; + FROM tips; With pandas, column selection is done by passing a list of column names to your DataFrame: .. ipython:: python - tips[['total_bill', 'tip', 'smoker', 'time']].head(5) + tips[["total_bill", "tip", "smoker", "time"]] Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). @@ -54,14 +54,13 @@ In SQL, you can add a calculated column: .. code-block:: sql SELECT *, tip/total_bill as tip_rate - FROM tips - LIMIT 5; + FROM tips; With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to append a new column: .. ipython:: python - tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + tips.assign(tip_rate=tips["tip"] / tips["total_bill"]) WHERE ----- @@ -71,59 +70,45 @@ Filtering in SQL is done via a WHERE clause. SELECT * FROM tips - WHERE time = 'Dinner' - LIMIT 5; + WHERE time = 'Dinner'; -DataFrames can be filtered in multiple ways; the most intuitive of which is using -:ref:`boolean indexing ` +.. include:: includes/filtering.rst -.. ipython:: python - - tips[tips['time'] == 'Dinner'].head(5) - -The above statement is simply passing a ``Series`` of True/False objects to the DataFrame, -returning all rows with True. - -.. 
ipython:: python - - is_dinner = tips['time'] == 'Dinner' - is_dinner.value_counts() - tips[is_dinner].head(5) +Just like SQL's ``OR`` and ``AND``, multiple conditions can be passed to a DataFrame using ``|`` +(``OR``) and ``&`` (``AND``). -Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and & -(AND). +Tips of more than $5 at Dinner meals: .. code-block:: sql - -- tips of more than $5.00 at Dinner meals SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00; .. ipython:: python - # tips of more than $5.00 at Dinner meals - tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] + tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)] + +Tips by parties of at least 5 diners OR bill total was more than $45: .. code-block:: sql - -- tips by parties of at least 5 diners OR bill total was more than $45 SELECT * FROM tips WHERE size >= 5 OR total_bill > 45; .. ipython:: python - # tips by parties of at least 5 diners OR bill total was more than $45 - tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)] + tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)] NULL checking is done using the :meth:`~pandas.Series.notna` and :meth:`~pandas.Series.isna` methods. .. ipython:: python - frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'], - 'col2': ['F', np.NaN, 'G', 'H', 'I']}) + frame = pd.DataFrame( + {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]} + ) frame Assume we have a table of the same structure as our DataFrame above. We can see only the records @@ -137,7 +122,7 @@ where ``col2`` IS NULL with the following query: .. ipython:: python - frame[frame['col2'].isna()] + frame[frame["col2"].isna()] Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series.notna`. @@ -149,12 +134,12 @@ Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series. .. ipython:: python - frame[frame['col1'].notna()] + frame[frame["col1"].notna()] GROUP BY -------- -In pandas, SQL's GROUP BY operations are performed using the similarly named +In pandas, SQL's ``GROUP BY`` operations are performed using the similarly named :meth:`~pandas.DataFrame.groupby` method. :meth:`~pandas.DataFrame.groupby` typically refers to a process where we'd like to split a dataset into groups, apply some function (typically aggregation) , and then combine the groups together. @@ -177,23 +162,23 @@ The pandas equivalent would be: .. ipython:: python - tips.groupby('sex').size() + tips.groupby("sex").size() Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not :meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because :meth:`~pandas.core.groupby.DataFrameGroupBy.count` applies the function to each column, returning -the number of ``not null`` records within each. +the number of ``NOT NULL`` records within each. .. ipython:: python - tips.groupby('sex').count() + tips.groupby("sex").count() Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method to an individual column: .. ipython:: python - tips.groupby('sex')['total_bill'].count() + tips.groupby("sex")["total_bill"].count() Multiple functions can also be applied at once. 
For instance, say we'd like to see how tip amount differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary @@ -208,12 +193,12 @@ to your grouped DataFrame, indicating which functions to apply to specific colum Fri 2.734737 19 Sat 2.993103 87 Sun 3.255132 76 - Thur 2.771452 62 + Thu 2.771452 62 */ .. ipython:: python - tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) + tips.groupby("day").agg({"tip": np.mean, "day": np.size}) Grouping by more than one column is done by passing a list of columns to the :meth:`~pandas.DataFrame.groupby` method. @@ -228,36 +213,40 @@ Grouping by more than one column is done by passing a list of columns to the No Fri 4 2.812500 Sat 45 3.102889 Sun 57 3.167895 - Thur 45 2.673778 + Thu 45 2.673778 Yes Fri 15 2.714000 Sat 42 2.875476 Sun 19 3.516842 - Thur 17 3.030000 + Thu 17 3.030000 */ .. ipython:: python - tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) + tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]}) .. _compare_with_sql.join: JOIN ---- -JOINs can be performed with :meth:`~pandas.DataFrame.join` or :meth:`~pandas.merge`. By default, -:meth:`~pandas.DataFrame.join` will join the DataFrames on their indices. Each method has -parameters allowing you to specify the type of join to perform (LEFT, RIGHT, INNER, FULL) or the -columns to join on (column names or indices). +``JOIN``\s can be performed with :meth:`~pandas.DataFrame.join` or :meth:`~pandas.merge`. By +default, :meth:`~pandas.DataFrame.join` will join the DataFrames on their indices. Each method has +parameters allowing you to specify the type of join to perform (``LEFT``, ``RIGHT``, ``INNER``, +``FULL``) or the columns to join on (column names or indices). + +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) Assume we have two database tables of the same name and structure as our DataFrames. -Now let's go over the various types of JOINs. +Now let's go over the various types of ``JOIN``\s. INNER JOIN ~~~~~~~~~~ @@ -271,21 +260,23 @@ INNER JOIN .. ipython:: python # merge performs an INNER JOIN by default - pd.merge(df1, df2, on='key') + pd.merge(df1, df2, on="key") :meth:`~pandas.merge` also offers parameters for cases when you'd like to join one DataFrame's column with another DataFrame's index. .. ipython:: python - indexed_df2 = df2.set_index('key') - pd.merge(df1, indexed_df2, left_on='key', right_index=True) + indexed_df2 = df2.set_index("key") + pd.merge(df1, indexed_df2, left_on="key", right_index=True) LEFT OUTER JOIN ~~~~~~~~~~~~~~~ + +Show all records from ``df1``. + .. code-block:: sql - -- show all records from df1 SELECT * FROM df1 LEFT OUTER JOIN df2 @@ -293,14 +284,15 @@ LEFT OUTER JOIN .. ipython:: python - # show all records from df1 - pd.merge(df1, df2, on='key', how='left') + pd.merge(df1, df2, on="key", how="left") RIGHT JOIN ~~~~~~~~~~ + +Show all records from ``df2``. + .. 
code-block:: sql - -- show all records from df2 SELECT * FROM df1 RIGHT OUTER JOIN df2 @@ -308,17 +300,17 @@ RIGHT JOIN .. ipython:: python - # show all records from df2 - pd.merge(df1, df2, on='key', how='right') + pd.merge(df1, df2, on="key", how="right") FULL JOIN ~~~~~~~~~ -pandas also allows for FULL JOINs, which display both sides of the dataset, whether or not the -joined columns find a match. As of writing, FULL JOINs are not supported in all RDBMS (MySQL). +pandas also allows for ``FULL JOIN``\s, which display both sides of the dataset, whether or not the +joined columns find a match. As of writing, ``FULL JOIN``\s are not supported in all RDBMS (MySQL). + +Show all records from both tables. .. code-block:: sql - -- show all records from both tables SELECT * FROM df1 FULL OUTER JOIN df2 @@ -326,20 +318,22 @@ joined columns find a match. As of writing, FULL JOINs are not supported in all .. ipython:: python - # show all records from both frames - pd.merge(df1, df2, on='key', how='outer') + pd.merge(df1, df2, on="key", how="outer") UNION ----- -UNION ALL can be performed using :meth:`~pandas.concat`. + +``UNION ALL`` can be performed using :meth:`~pandas.concat`. .. ipython:: python - df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], - 'rank': range(1, 4)}) - df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], - 'rank': [1, 4, 5]}) + df1 = pd.DataFrame( + {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)} + ) + df2 = pd.DataFrame( + {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]} + ) .. code-block:: sql @@ -362,7 +356,7 @@ UNION ALL can be performed using :meth:`~pandas.concat`. pd.concat([df1, df2]) -SQL's UNION is similar to UNION ALL, however UNION will remove duplicate rows. +SQL's ``UNION`` is similar to ``UNION ALL``, however ``UNION`` will remove duplicate rows. .. code-block:: sql @@ -388,6 +382,20 @@ In pandas, you can use :meth:`~pandas.concat` in conjunction with pd.concat([df1, df2]).drop_duplicates() + +LIMIT +----- + +.. code-block:: sql + + SELECT * FROM tips + LIMIT 10; + +.. ipython:: python + + tips.head(10) + + pandas equivalents for some SQL analytic and aggregate functions ---------------------------------------------------------------- @@ -403,7 +411,7 @@ Top n rows with offset .. ipython:: python - tips.nlargest(10 + 5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns="tip").tail(10) Top n rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -423,20 +431,30 @@ Top n rows per group .. ipython:: python - (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False) - .groupby(['day']) - .cumcount() + 1) - .query('rn < 3') - .sort_values(['day', 'rn'])) + ( + tips.assign( + rn=tips.sort_values(["total_bill"], ascending=False) + .groupby(["day"]) + .cumcount() + + 1 + ) + .query("rn < 3") + .sort_values(["day", "rn"]) + ) -the same using `rank(method='first')` function +the same using ``rank(method='first')`` function .. ipython:: python - (tips.assign(rnk=tips.groupby(['day'])['total_bill'] - .rank(method='first', ascending=False)) - .query('rnk < 3') - .sort_values(['day', 'rnk'])) + ( + tips.assign( + rnk=tips.groupby(["day"])["total_bill"].rank( + method="first", ascending=False + ) + ) + .query("rnk < 3") + .sort_values(["day", "rnk"]) + ) .. code-block:: sql @@ -453,16 +471,17 @@ the same using `rank(method='first')` function Let's find tips with (rank < 3) per gender group for (tips < 2). 
Notice that when using ``rank(method='min')`` function -`rnk_min` remains the same for the same `tip` -(as Oracle's RANK() function) +``rnk_min`` remains the same for the same ``tip`` +(as Oracle's ``RANK()`` function) .. ipython:: python - (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex', 'rnk_min'])) + ( + tips[tips["tip"] < 2] + .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min")) + .query("rnk_min < 3") + .sort_values(["sex", "rnk_min"]) + ) UPDATE @@ -476,7 +495,7 @@ UPDATE .. ipython:: python - tips.loc[tips['tip'] < 2, 'tip'] *= 2 + tips.loc[tips["tip"] < 2, "tip"] *= 2 DELETE ------ @@ -486,8 +505,8 @@ DELETE DELETE FROM tips WHERE tip > 9; -In pandas we select the rows that should remain, instead of deleting them +In pandas we select the rows that should remain instead of deleting them: .. ipython:: python - tips = tips.loc[tips['tip'] <= 9] + tips = tips.loc[tips["tip"] <= 9] diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 06f9e45466243..b4b0c42d1db1d 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -8,28 +8,8 @@ For potential users coming from `Stata `__ this page is meant to demonstrate how different Stata operations would be performed in pandas. -If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` -to familiarize yourself with the library. +.. include:: includes/introduction.rst -As is customary, we import pandas and NumPy as follows. This means that we can refer to the -libraries as ``pd`` and ``np``, respectively, for the rest of the document. - -.. ipython:: python - - import pandas as pd - import numpy as np - - -.. note:: - - Throughout this tutorial, the pandas ``DataFrame`` will be displayed by calling - ``df.head()``, which displays the first N (default 5) rows of the ``DataFrame``. - This is often used in interactive work (e.g. `Jupyter notebook - `_ or terminal) -- the equivalent in Stata would be: - - .. code-block:: stata - - list in 1/5 Data structures --------------- @@ -48,14 +28,17 @@ General terminology translation ``NaN``, ``.`` -``DataFrame`` / ``Series`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ +``DataFrame`` +~~~~~~~~~~~~~ A ``DataFrame`` in pandas is analogous to a Stata data set -- a two-dimensional data source with labeled columns that can be of different types. As will be shown in this document, almost any operation that can be applied to a data set in Stata can also be accomplished in pandas. +``Series`` +~~~~~~~~~~ + A ``Series`` is the data structure that represents one column of a ``DataFrame``. Stata doesn't have a separate data structure for a single column, but in general, working with a ``Series`` is analogous to referencing a column @@ -78,6 +61,12 @@ see the :ref:`indexing documentation` for much more on how to use an ``Index`` effectively. +Copies vs. in place operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. include:: includes/copies.rst + + Data input / output ------------------- @@ -96,23 +85,14 @@ specifying the column names. 5 6 end -A pandas ``DataFrame`` can be constructed in many different ways, -but for a small number of values, it is often convenient to specify it as -a Python dictionary, where the keys are the column names -and the values are the data. - -.. 
ipython:: python - - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) - df - +.. include:: includes/construct_dataframe.rst Reading external data ~~~~~~~~~~~~~~~~~~~~~ Like Stata, pandas provides utilities for reading in data from many formats. The ``tips`` data set, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. Stata provides ``import delimited`` to read csv data into a data set in memory. @@ -127,10 +107,12 @@ the data set if presented with a url. .. ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) - tips.head() + tips Like ``import delimited``, :func:`read_csv` can take a number of parameters to specify how the data should be parsed. For example, if the data were instead tab delimited, @@ -139,22 +121,34 @@ the pandas command would be: .. code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) -Pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. +pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. .. code-block:: python - df = pd.read_stata('data.dta') + df = pd.read_stata("data.dta") In addition to text/csv and Stata files, pandas supports a variety of other data formats such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a ``pd.read_*`` function. See the :ref:`IO documentation` for more details. +Limiting output +~~~~~~~~~~~~~~~ + +.. include:: includes/limit.rst + +The equivalent in Stata would be: + +.. code-block:: stata + + list in 1/5 + + Exporting data ~~~~~~~~~~~~~~ @@ -168,13 +162,13 @@ Similarly in pandas, the opposite of ``read_csv`` is :meth:`DataFrame.to_csv`. .. code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") -Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. +pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. .. code-block:: python - tips.to_stata('tips2.dta') + tips.to_stata("tips2.dta") Data operations @@ -193,18 +187,8 @@ the column from the data set. generate new_bill = total_bill / 2 drop new_bill -pandas provides similar vectorized operations by -specifying the individual ``Series`` in the ``DataFrame``. -New columns can be assigned in the same way. The :meth:`DataFrame.drop` method -drops a column from the ``DataFrame``. - -.. ipython:: python - - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2 - tips.head() +.. include:: includes/column_operations.rst - tips = tips.drop('new_bill', axis=1) Filtering ~~~~~~~~~ @@ -215,12 +199,7 @@ Filtering in Stata is done with an ``if`` clause on one or more columns. list if total_bill > 10 -DataFrames can be filtered in multiple ways; the most intuitive of which is using -:ref:`boolean indexing `. - -.. ipython:: python - - tips[tips['total_bill'] > 10].head() +.. include:: includes/filtering.rst If/then logic ~~~~~~~~~~~~~ @@ -232,18 +211,7 @@ In Stata, an ``if`` clause can also be used to create new columns. 
generate bucket = "low" if total_bill < 10 replace bucket = "high" if total_bill >= 10 -The same operation in pandas can be accomplished using -the ``where`` method from ``numpy``. - -.. ipython:: python - - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') - tips.head() - -.. ipython:: python - :suppress: - - tips = tips.drop('bucket', axis=1) +.. include:: includes/if_then.rst Date functionality ~~~~~~~~~~~~~~~~~~ @@ -271,24 +239,7 @@ functions, pandas supports other Time Series features not available in Stata (such as time zone handling and custom offsets) -- see the :ref:`timeseries documentation` for more details. -.. ipython:: python - - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) - - tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', - 'months_between']].head() - -.. ipython:: python - :suppress: - - tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between'], axis=1) +.. include:: includes/time_date.rst Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -303,20 +254,7 @@ Stata provides keywords to select, drop, and rename columns. rename total_bill total_bill_2 -The same operations are expressed in pandas below. Note that in contrast to Stata, these -operations do not happen in place. To make these changes persist, assign the operation back -to a variable. - -.. ipython:: python - - # keep - tips[['sex', 'total_bill', 'tip']].head() - - # drop - tips.drop('sex', axis=1).head() - - # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() +.. include:: includes/column_selection.rst Sorting by values @@ -328,14 +266,7 @@ Sorting in Stata is accomplished via ``sort`` sort sex total_bill -pandas objects have a :meth:`DataFrame.sort_values` method, which -takes a list of columns to sort by. - -.. ipython:: python - - tips = tips.sort_values(['sex', 'total_bill']) - tips.head() - +.. include:: includes/sorting.rst String processing ----------------- @@ -351,14 +282,7 @@ Stata determines the length of a character string with the :func:`strlen` and generate strlen_time = strlen(time) generate ustrlen_time = ustrlen(time) -Python determines the length of a character string with the ``len`` function. -In Python 3, all strings are Unicode strings. ``len`` includes trailing blanks. -Use ``len`` and ``rstrip`` to exclude trailing blanks. - -.. ipython:: python - - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() +.. include:: includes/length.rst Finding position of substring @@ -372,15 +296,7 @@ first position of the substring you supply as the second argument. generate str_position = strpos(sex, "ale") -Python determines the position of a character in a string with the -:func:`find` function. ``find`` searches for the first position of the -substring. If the substring is found, the function returns its -position. Keep in mind that Python indexes are zero-based and -the function will return -1 if it fails to find the substring. - -.. ipython:: python - - tips['sex'].str.find("ale").head() +.. 
include:: includes/find_substring.rst Extracting substring by position @@ -392,13 +308,7 @@ Stata extracts a substring from a string based on its position with the :func:`s generate short_sex = substr(sex, 1, 1) -With pandas you can use ``[]`` notation to extract a substring -from a string by position locations. Keep in mind that Python -indexes are zero-based. - -.. ipython:: python - - tips['sex'].str[0:1].head() +.. include:: includes/extract_substring.rst Extracting nth word @@ -419,16 +329,7 @@ second argument specifies which word you want to extract. generate first_name = word(name, 1) generate last_name = word(name, -1) -Python extracts a substring from a string based on its text -by using regular expressions. There are much more powerful -approaches, but this just shows a simple approach. - -.. ipython:: python - - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['string'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['string'].str.rsplit(" ", expand=True)[0] - firstlast +.. include:: includes/nth_word.rst Changing case @@ -451,29 +352,13 @@ change the case of ASCII and Unicode strings, respectively. generate title = strproper(string) list -The equivalent Python functions are ``upper``, ``lower``, and ``title``. +.. include:: includes/case.rst -.. ipython:: python - - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['upper'] = firstlast['string'].str.upper() - firstlast['lower'] = firstlast['string'].str.lower() - firstlast['title'] = firstlast['string'].str.title() - firstlast Merging ------- -The following tables will be used in the merge examples - -.. ipython:: python - - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) - df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) - df2 +.. include:: includes/merge_setup.rst In Stata, to perform a merge, one data set must be in memory and the other must be referenced as a file name on disk. In @@ -528,38 +413,15 @@ or the intersection of the two by using the values created in the restore merge 1:n key using df2.dta -pandas DataFrames have a :meth:`DataFrame.merge` method, which provides -similar functionality. Note that different join -types are accomplished via the ``how`` keyword. - -.. ipython:: python - - inner_join = df1.merge(df2, on=['key'], how='inner') - inner_join - - left_join = df1.merge(df2, on=['key'], how='left') - left_join - - right_join = df1.merge(df2, on=['key'], how='right') - right_join - - outer_join = df1.merge(df2, on=['key'], how='outer') - outer_join +.. include:: includes/merge.rst Missing data ------------ -Like Stata, pandas has a representation for missing data -- the -special float value ``NaN`` (not a number). Many of the semantics -are the same; for example missing data propagates through numeric -operations, and is ignored by default for aggregations. - -.. ipython:: python +Both pandas and Stata have a representation for missing data. - outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() +.. include:: includes/missing_intro.rst One difference is that missing data cannot be compared to its sentinel value. For example, in Stata you could do this to filter missing values. @@ -571,30 +433,7 @@ For example, in Stata you could do this to filter missing values. * Keep non-missing values list if value_x != . -This doesn't work in pandas. 
Instead, the :func:`pd.isna` or :func:`pd.notna` functions -should be used for comparisons. - -.. ipython:: python - - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] - -Pandas also provides a variety of methods to work with missing data -- some of -which would be challenging to express in Stata. For example, there are methods to -drop all rows with any missing values, replacing missing values with a specified -value, like the mean, or forward filling from previous rows. See the -:ref:`missing data documentation` for more. - -.. ipython:: python - - # Drop rows with any missing value - outer_join.dropna() - - # Fill forwards - outer_join.fillna(method='ffill') - - # Impute missing values with the mean - outer_join['value_x'].fillna(outer_join['value_x'].mean()) +.. include:: includes/missing.rst GroupBy @@ -611,14 +450,7 @@ numeric columns. collapse (sum) total_bill tip, by(sex smoker) -pandas provides a flexible ``groupby`` mechanism that -allows similar aggregations. See the :ref:`groupby documentation` -for more details and examples. - -.. ipython:: python - - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() - tips_summed.head() +.. include:: includes/groupby.rst Transformation @@ -633,16 +465,7 @@ For example, to subtract the mean for each observation by smoker group. bysort sex smoker: egen group_bill = mean(total_bill) generate adj_total_bill = total_bill - group_bill - -pandas ``groupby`` provides a ``transform`` mechanism that allows -these type of operations to be succinctly expressed in one -operation. - -.. ipython:: python - - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') - tips.head() +.. include:: includes/transform.rst By group processing @@ -661,7 +484,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations @@ -670,9 +493,9 @@ Other considerations Disk vs memory ~~~~~~~~~~~~~~ -Pandas and Stata both operate exclusively in memory. This means that the size of +pandas and Stata both operate exclusively in memory. This means that the size of data able to be loaded in pandas is limited by your machine's memory. If out of core processing is needed, one possibility is the -`dask.dataframe `_ +`dask.dataframe `_ library, which provides a subset of pandas functionality for an on-disk ``DataFrame``. diff --git a/doc/source/getting_started/comparison/includes/case.rst b/doc/source/getting_started/comparison/includes/case.rst new file mode 100644 index 0000000000000..c00a830bc8511 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/case.rst @@ -0,0 +1,10 @@ +The equivalent pandas methods are :meth:`Series.str.upper`, :meth:`Series.str.lower`, and +:meth:`Series.str.title`. + +.. 
ipython:: python + + firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]}) + firstlast["upper"] = firstlast["string"].str.upper() + firstlast["lower"] = firstlast["string"].str.lower() + firstlast["title"] = firstlast["string"].str.title() + firstlast diff --git a/doc/source/getting_started/comparison/includes/column_operations.rst b/doc/source/getting_started/comparison/includes/column_operations.rst new file mode 100644 index 0000000000000..b23b931ed2db1 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/column_operations.rst @@ -0,0 +1,11 @@ +pandas provides vectorized operations by specifying the individual ``Series`` in the +``DataFrame``. New columns can be assigned in the same way. The :meth:`DataFrame.drop` method drops +a column from the ``DataFrame``. + +.. ipython:: python + + tips["total_bill"] = tips["total_bill"] - 2 + tips["new_bill"] = tips["total_bill"] / 2 + tips + + tips = tips.drop("new_bill", axis=1) diff --git a/doc/source/getting_started/comparison/includes/column_selection.rst b/doc/source/getting_started/comparison/includes/column_selection.rst new file mode 100644 index 0000000000000..071645c9718cb --- /dev/null +++ b/doc/source/getting_started/comparison/includes/column_selection.rst @@ -0,0 +1,22 @@ +The same operations are expressed in pandas below. + +Keep certain columns +'''''''''''''''''''' + +.. ipython:: python + + tips[["sex", "total_bill", "tip"]] + +Drop a column +''''''''''''' + +.. ipython:: python + + tips.drop("sex", axis=1) + +Rename a column +''''''''''''''' + +.. ipython:: python + + tips.rename(columns={"total_bill": "total_bill_2"}) diff --git a/doc/source/getting_started/comparison/includes/construct_dataframe.rst b/doc/source/getting_started/comparison/includes/construct_dataframe.rst new file mode 100644 index 0000000000000..4d066c7962d98 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/construct_dataframe.rst @@ -0,0 +1,9 @@ +A pandas ``DataFrame`` can be constructed in many different ways, +but for a small number of values, it is often convenient to specify it as +a Python dictionary, where the keys are the column names +and the values are the data. + +.. ipython:: python + + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) + df diff --git a/doc/source/getting_started/comparison/includes/copies.rst b/doc/source/getting_started/comparison/includes/copies.rst new file mode 100644 index 0000000000000..4f49c3a1a762e --- /dev/null +++ b/doc/source/getting_started/comparison/includes/copies.rst @@ -0,0 +1,28 @@ +Most pandas operations return copies of the ``Series``/``DataFrame``. To make the changes "stick", +you'll need to either assign to a new variable: + + .. code-block:: python + + sorted_df = df.sort_values("col1") + + +or overwrite the original one: + + .. code-block:: python + + df = df.sort_values("col1") + +.. note:: + + You will see an ``inplace=True`` or ``copy=False`` keyword argument available for + some methods: + + .. code-block:: python + + df.replace(5, inplace=True) + + There is an active discussion about deprecating and removing ``inplace`` and ``copy`` for + most methods (e.g. ``dropna``) except for a very small subset of methods + (including ``replace``). Both keywords won't be + necessary anymore in the context of Copy-on-Write. The proposal can be found + `here `_. 
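The ``copies.rst`` include above explains that most pandas operations return a copy rather than modifying the object in place. As a small illustrative aside (not part of the patch itself, and using a made-up three-row DataFrame), the sketch below shows the difference between calling ``sort_values`` on its own and assigning the result back:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"col1": [3, 1, 2]})

    # sort_values returns a new, sorted DataFrame; df itself keeps its order
    sorted_df = df.sort_values("col1")
    print(df["col1"].tolist())         # [3, 1, 2]
    print(sorted_df["col1"].tolist())  # [1, 2, 3]

    # overwriting the original name is how the change is made to "stick"
    df = df.sort_values("col1")
    print(df["col1"].tolist())         # [1, 2, 3]

Methods that still accept ``inplace=True`` mutate ``df`` directly instead; that keyword is the behaviour the note above discusses deprecating for most methods.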
diff --git a/doc/source/getting_started/comparison/includes/extract_substring.rst b/doc/source/getting_started/comparison/includes/extract_substring.rst new file mode 100644 index 0000000000000..1ba0dfac2317a --- /dev/null +++ b/doc/source/getting_started/comparison/includes/extract_substring.rst @@ -0,0 +1,7 @@ +With pandas you can use ``[]`` notation to extract a substring +from a string by position locations. Keep in mind that Python +indexes are zero-based. + +.. ipython:: python + + tips["sex"].str[0:1] diff --git a/doc/source/getting_started/comparison/includes/filtering.rst b/doc/source/getting_started/comparison/includes/filtering.rst new file mode 100644 index 0000000000000..8ddf7c0d2fa39 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/filtering.rst @@ -0,0 +1,16 @@ +DataFrames can be filtered in multiple ways; the most intuitive of which is using +:ref:`boolean indexing `. + +.. ipython:: python + + tips[tips["total_bill"] > 10] + +The above statement is simply passing a ``Series`` of ``True``/``False`` objects to the DataFrame, +returning all rows with ``True``. + +.. ipython:: python + + is_dinner = tips["time"] == "Dinner" + is_dinner + is_dinner.value_counts() + tips[is_dinner] diff --git a/doc/source/getting_started/comparison/includes/find_substring.rst b/doc/source/getting_started/comparison/includes/find_substring.rst new file mode 100644 index 0000000000000..42543d05a0014 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/find_substring.rst @@ -0,0 +1,8 @@ +You can find the position of a character in a column of strings with the :meth:`Series.str.find` +method. ``find`` searches for the first position of the substring. If the substring is found, the +method returns its position. If not found, it returns ``-1``. Keep in mind that Python indexes are +zero-based. + +.. ipython:: python + + tips["sex"].str.find("ale") diff --git a/doc/source/getting_started/comparison/includes/groupby.rst b/doc/source/getting_started/comparison/includes/groupby.rst new file mode 100644 index 0000000000000..93d5d51e3fb00 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/groupby.rst @@ -0,0 +1,7 @@ +pandas provides a flexible ``groupby`` mechanism that allows similar aggregations. See the +:ref:`groupby documentation` for more details and examples. + +.. ipython:: python + + tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum() + tips_summed diff --git a/doc/source/getting_started/comparison/includes/if_then.rst b/doc/source/getting_started/comparison/includes/if_then.rst new file mode 100644 index 0000000000000..f94e7588827f5 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/if_then.rst @@ -0,0 +1,12 @@ +The same operation in pandas can be accomplished using +the ``where`` method from ``numpy``. + +.. ipython:: python + + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") + tips + +.. ipython:: python + :suppress: + + tips = tips.drop("bucket", axis=1) diff --git a/doc/source/getting_started/comparison/includes/introduction.rst b/doc/source/getting_started/comparison/includes/introduction.rst new file mode 100644 index 0000000000000..aedf2875dc452 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/introduction.rst @@ -0,0 +1,9 @@ +If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` +to familiarize yourself with the library. + +As is customary, we import pandas and NumPy as follows: + +.. 
ipython:: python + + import pandas as pd + import numpy as np diff --git a/doc/source/getting_started/comparison/includes/length.rst b/doc/source/getting_started/comparison/includes/length.rst new file mode 100644 index 0000000000000..9141fd4ea582a --- /dev/null +++ b/doc/source/getting_started/comparison/includes/length.rst @@ -0,0 +1,8 @@ +You can find the length of a character string with :meth:`Series.str.len`. +In Python 3, all strings are Unicode strings. ``len`` includes trailing blanks. +Use ``len`` and ``rstrip`` to exclude trailing blanks. + +.. ipython:: python + + tips["time"].str.len() + tips["time"].str.rstrip().str.len() diff --git a/doc/source/getting_started/comparison/includes/limit.rst b/doc/source/getting_started/comparison/includes/limit.rst new file mode 100644 index 0000000000000..4efeb4e43d07c --- /dev/null +++ b/doc/source/getting_started/comparison/includes/limit.rst @@ -0,0 +1,7 @@ +By default, pandas will truncate output of large ``DataFrame``\s to show the first and last rows. +This can be overridden by :ref:`changing the pandas options `, or using +:meth:`DataFrame.head` or :meth:`DataFrame.tail`. + +.. ipython:: python + + tips.head(5) diff --git a/doc/source/getting_started/comparison/includes/merge.rst b/doc/source/getting_started/comparison/includes/merge.rst new file mode 100644 index 0000000000000..b8e3f54fd132b --- /dev/null +++ b/doc/source/getting_started/comparison/includes/merge.rst @@ -0,0 +1,17 @@ +pandas DataFrames have a :meth:`~DataFrame.merge` method, which provides similar functionality. The +data does not have to be sorted ahead of time, and different join types are accomplished via the +``how`` keyword. + +.. ipython:: python + + inner_join = df1.merge(df2, on=["key"], how="inner") + inner_join + + left_join = df1.merge(df2, on=["key"], how="left") + left_join + + right_join = df1.merge(df2, on=["key"], how="right") + right_join + + outer_join = df1.merge(df2, on=["key"], how="outer") + outer_join diff --git a/doc/source/getting_started/comparison/includes/merge_setup.rst b/doc/source/getting_started/comparison/includes/merge_setup.rst new file mode 100644 index 0000000000000..f115cd58f7a94 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/merge_setup.rst @@ -0,0 +1,8 @@ +The following tables will be used in the merge examples: + +.. ipython:: python + + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) + df1 + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) + df2 diff --git a/doc/source/getting_started/comparison/includes/missing.rst b/doc/source/getting_started/comparison/includes/missing.rst new file mode 100644 index 0000000000000..341c7d5498d82 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/missing.rst @@ -0,0 +1,31 @@ +In pandas, :meth:`Series.isna` and :meth:`Series.notna` can be used to filter the rows. + +.. ipython:: python + + outer_join[outer_join["value_x"].isna()] + outer_join[outer_join["value_x"].notna()] + +pandas provides :ref:`a variety of methods to work with missing data `. Here are some examples: + +Drop rows with missing values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + outer_join.dropna() + +Forward fill from previous rows +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + outer_join.fillna(method="ffill") + +Replace missing values with a specified value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using the mean: + +.. 
ipython:: python + + outer_join["value_x"].fillna(outer_join["value_x"].mean()) diff --git a/doc/source/getting_started/comparison/includes/missing_intro.rst b/doc/source/getting_started/comparison/includes/missing_intro.rst new file mode 100644 index 0000000000000..366aa43d1264c --- /dev/null +++ b/doc/source/getting_started/comparison/includes/missing_intro.rst @@ -0,0 +1,9 @@ +pandas represents missing data with the special float value ``NaN`` (not a number). Many of the +semantics are the same; for example missing data propagates through numeric operations, and is +ignored by default for aggregations. + +.. ipython:: python + + outer_join + outer_join["value_x"] + outer_join["value_y"] + outer_join["value_x"].sum() diff --git a/doc/source/getting_started/comparison/includes/nth_word.rst b/doc/source/getting_started/comparison/includes/nth_word.rst new file mode 100644 index 0000000000000..20e2ec47a8c9d --- /dev/null +++ b/doc/source/getting_started/comparison/includes/nth_word.rst @@ -0,0 +1,9 @@ +The simplest way to extract words in pandas is to split the strings by spaces, then reference the +word by index. Note there are more powerful approaches should you need them. + +.. ipython:: python + + firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) + firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[1] + firstlast diff --git a/doc/source/getting_started/comparison/includes/sorting.rst b/doc/source/getting_started/comparison/includes/sorting.rst new file mode 100644 index 0000000000000..4e2e40a18adbd --- /dev/null +++ b/doc/source/getting_started/comparison/includes/sorting.rst @@ -0,0 +1,6 @@ +pandas has a :meth:`DataFrame.sort_values` method, which takes a list of columns to sort by. + +.. ipython:: python + + tips = tips.sort_values(["sex", "total_bill"]) + tips diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst new file mode 100644 index 0000000000000..fb9ee2e216cd7 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/time_date.rst @@ -0,0 +1,22 @@ +.. ipython:: python + + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") + + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ] + +.. ipython:: python + :suppress: + + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) diff --git a/doc/source/getting_started/comparison/includes/transform.rst b/doc/source/getting_started/comparison/includes/transform.rst new file mode 100644 index 0000000000000..b7599471432ad --- /dev/null +++ b/doc/source/getting_started/comparison/includes/transform.rst @@ -0,0 +1,8 @@ +pandas provides a :ref:`groupby.transform` mechanism that allows these type of operations to be +succinctly expressed in one operation. + +.. 
ipython:: python + + gb = tips.groupby("smoker")["total_bill"] + tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean") + tips diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index 998706ce0c639..c3f58ce1f3d6d 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -11,5 +11,6 @@ Comparison with other tools comparison_with_r comparison_with_sql + comparison_with_spreadsheets comparison_with_sas comparison_with_stata diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index eb7ee000a9a86..3bfd2c3b0219a 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -9,82 +9,49 @@ Getting started Installation ------------ -.. raw:: html +.. panels:: + :card: + install-card + :column: col-lg-6 col-md-6 col-sm-12 col-xs-12 p-3 -
-    Working with conda?

+ Working with conda? + ^^^^^^^^^^^^^^^^^^^ -pandas is part of the `Anaconda `__ distribution and can be -installed with Anaconda or Miniconda: + pandas is part of the `Anaconda `__ + distribution and can be installed with Anaconda or Miniconda: -.. raw:: html + ++++++++++++++++++++++ -

-    Prefer pip?

+ pandas can be installed via pip from `PyPI `__. -pandas can be installed via pip from `PyPI `__. + ++++ -.. raw:: html + .. code-block:: bash -

-    In-depth instructions?

-    Installing a specific version?
-    Installing from source?
-    Check the advanced installation page.

- -.. container:: custom-button - - :ref:`Learn more ` + .. link-button:: ./install.html + :type: url + :text: Learn more + :classes: btn-secondary stretched-link -.. raw:: html - -
.. _gentle_intro:

@@ -116,7 +83,7 @@ Intro to pandas
When working with tabular data, such as data stored in spreadsheets or databases, pandas is the right tool for you. pandas will help you -to explore, clean and process your data. In pandas, a data table is called a :class:`DataFrame`. +to explore, clean, and process your data. In pandas, a data table is called a :class:`DataFrame`. .. image:: ../_static/schemas/01_table_dataframe.svg :align: center @@ -351,7 +318,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire -data set, a sliding window of the data or grouped by categories. The latter is also known as the split-apply-combine approach. +data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach. .. image:: ../_static/schemas/06_groupby.svg :align: center @@ -444,7 +411,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
-Multiple tables can be concatenated both column wise as row wise and database-like join/merge operations are provided to combine multiple tables of data. +Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data. .. image:: ../_static/schemas/08_concat_row.svg :align: center @@ -533,7 +500,7 @@ pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. pandas provides a wide range of functions to cleaning textual data and extract useful information from it. +Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html @@ -547,7 +514,7 @@ Data sets do not only contain numerical data. pandas provides a wide range of fu -:ref:`To user guide ` +:ref:`To user guide ` .. raw:: html @@ -569,81 +536,91 @@ Coming from... Are you familiar with other software for manipulating tablular data? Learn the pandas-equivalent operations compared to software you already know: -.. raw:: html +.. panels:: + :img-top-cls: dark-light + :card: + comparison-card text-center shadow + :column: col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex -
-    R project logo

-    The R programming language provides the dataframe data structure and multiple packages,
-    such as tidyverse use and extend data.frames for convenient data handling
-    functionalities similar to pandas.

+ --- + :card: + comparison-card-r + :img-top: ../_static/logo_r.svg -.. container:: custom-button + The `R programming language `__ provides the + ``data.frame`` data structure and multiple packages, such as + `tidyverse `__ use and extend ``data.frame`` + for convenient data handling functionalities similar to pandas. - :ref:`Learn more ` + +++ -.. raw:: html + .. link-button:: compare_with_r + :type: ref + :text: Learn more + :classes: btn-secondary stretched-link -
-    SQL logo

-    Already familiar to SELECT, GROUP BY, JOIN, etc.?
-    Most of these SQL manipulations do have equivalents in pandas.

-.. container:: custom-button + --- + :card: + comparison-card-sql + :img-top: ../_static/logo_sql.svg - :ref:`Learn more ` + Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.? + Most of these SQL manipulations do have equivalents in pandas. -.. raw:: html + +++ -
-    STATA logo

-    The data set included in the
-    STATA statistical software suite corresponds
-    to the pandas dataframe. Many of the operations known from STATA have an equivalent
-    in pandas.

+ .. link-button:: compare_with_sql + :type: ref + :text: Learn more + :classes: btn-secondary stretched-link -.. container:: custom-button - :ref:`Learn more ` + --- + :card: + comparison-card-stata + :img-top: ../_static/logo_stata.svg -.. raw:: html + The ``data set`` included in the `STATA `__ + statistical software suite corresponds to the pandas ``DataFrame``. + Many of the operations known from STATA have an equivalent in pandas. -
-    SAS logo

-    The SAS statistical software suite
-    also provides the data set corresponding to the pandas dataframe.
-    Also SAS vectorized operations, filtering, string processing operations, and more have similar
-    functions in pandas.

+ +++ -.. container:: custom-button + .. link-button:: compare_with_stata + :type: ref + :text: Learn more + :classes: btn-secondary stretched-link - :ref:`Learn more ` -.. raw:: html + --- + :card: + comparison-card-excel + :img-top: ../_static/spreadsheets/logo_excel.svg + + Users of `Excel `__ + or other spreadsheet programs will find that many of the concepts are + transferrable to pandas. + + +++ + + .. link-button:: compare_with_spreadsheets + :type: ref + :text: Learn more + :classes: btn-secondary stretched-link + + + --- + :card: + comparison-card-sas + :img-top: ../_static/logo_sas.svg + + The `SAS `__ statistical software suite + also provides the ``data set`` corresponding to the pandas ``DataFrame``. + Also SAS vectorized operations, filtering, string processing operations, + and more have similar functions in pandas. + + +++ + + .. link-button:: compare_with_sas + :type: ref + :text: Learn more + :classes: btn-secondary stretched-link -
Tutorials --------- diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b79a9cd872c47..8a2d5ad418c40 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -12,13 +12,15 @@ cross platform distribution for data analysis and scientific computing. This is the recommended installation method for most users. Instructions for installing from source, -`PyPI `__, `ActivePython `__, various Linux distributions, or a +`PyPI `__, `ActivePython `__, various Linux distributions, or a `development version `__ are also provided. +.. _install.version: + Python version support ---------------------- -Officially Python 3.6.1 and above, 3.7, and 3.8. +Officially Python 3.8, 3.9, 3.10 and 3.11. Installing pandas ----------------- @@ -28,24 +30,24 @@ Installing pandas Installing with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~ -Installing pandas and the rest of the `NumPy `__ and -`SciPy `__ stack can be a little +Installing pandas and the rest of the `NumPy `__ and +`SciPy `__ stack can be a little difficult for inexperienced users. The simplest way to install not only pandas, but Python and the most popular -packages that make up the `SciPy `__ stack -(`IPython `__, `NumPy `__, +packages that make up the `SciPy `__ stack +(`IPython `__, `NumPy `__, `Matplotlib `__, ...) is with `Anaconda `__, a cross-platform -(Linux, Mac OS X, Windows) Python distribution for data analytics and +(Linux, macOS, Windows) Python distribution for data analytics and scientific computing. After running the installer, the user will have access to pandas and the -rest of the `SciPy `__ stack without needing to install +rest of the `SciPy `__ stack without needing to install anything else, and without needing to wait for any software to be compiled. Installation instructions for `Anaconda `__ -`can be found here `__. +`can be found here `__. A full list of the packages available as part of the `Anaconda `__ distribution @@ -68,18 +70,18 @@ and involves downloading the installer which is a few hundred megabytes in size. If you want to have more control on which packages, or have a limited internet bandwidth, then installing pandas with -`Miniconda `__ may be a better solution. +`Miniconda `__ may be a better solution. -`Conda `__ is the package manager that the +`Conda `__ is the package manager that the `Anaconda `__ distribution is built upon. It is a package manager that is both cross-platform and language agnostic (it can play a similar role to a pip and virtualenv combination). `Miniconda `__ allows you to create a minimal self contained Python installation, and then use the -`Conda `__ command to install additional packages. +`Conda `__ command to install additional packages. -First you will need `Conda `__ to be installed and +First you will need `Conda `__ to be installed and downloading and running the `Miniconda `__ will do this for you. The installer @@ -130,16 +132,29 @@ Installing from PyPI pandas can be installed via pip from `PyPI `__. +.. note:: + You must have ``pip>=19.3`` to install from PyPI. + :: pip install pandas +pandas can also be installed with sets of optional dependencies to enable certain functionality. For example, +to install pandas with the optional dependencies to read Excel files. 
+ +:: + + pip install "pandas[excel]" + + +The full list of extras that can be installed can be found in the :ref:`dependency section.` + Installing with ActivePython ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Installation instructions for -`ActivePython `__ can be found -`here `__. Versions +`ActivePython `__ can be found +`here `__. Versions 2.7, 3.5 and 3.6 include pandas. Installing using your Linux distribution's package manager. @@ -153,10 +168,10 @@ The commands in this table will install pandas for Python 3 from your distributi Debian, stable, `official Debian repository `__ , ``sudo apt-get install python3-pandas`` - Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` + Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python3-pandas`` OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python3-pandas`` - Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` Centos/RHEL, stable, `EPEL repository `__ , ``yum install python3-pandas`` **However**, the packages in the linux package managers are often a few versions behind, so @@ -179,12 +194,28 @@ In Linux/Mac you can run ``which python`` on your terminal and it will tell you using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas in this document: `installation instructions `. +You can find simple installation instructions for pandas in this document: ``installation instructions ``. Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. +See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a pandas development environment. + +Installing the development version of pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Installing a nightly build is the quickest way to: + +* Try a new feature that will be shipped in the next release (that is, a feature from a pull-request that was recently merged to the main branch). +* Check whether a bug you encountered has been fixed since the last release. + +You can install the nightly build of pandas using the scipy-wheels-nightly index from the PyPI registry of anaconda.org with the following command:: + + pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas + +Note that first uninstalling pandas might be required to be able to install nightly builds:: + + pip uninstall pandas -y Running the test suite ---------------------- @@ -193,116 +224,145 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. 
To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 5.0.1 and `Hypothesis -`__ >= 3.58, then run: +`__ >= 7.0 and `Hypothesis +`__ >= 6.34.2, then run: :: >>> pd.test() - running: pytest --skip-slow --skip-network C:\Users\TP\Anaconda3\envs\py36\lib\site-packages\pandas - ============================= test session starts ============================= - platform win32 -- Python 3.6.2, pytest-3.6.0, py-1.4.34, pluggy-0.4.0 - rootdir: C:\Users\TP\Documents\Python\pandasdev\pandas, inifile: setup.cfg - collected 12145 items / 3 skipped + running: pytest --skip-slow --skip-network --skip-db /home/user/anaconda3/lib/python3.9/site-packages/pandas + + ============================= test session starts ============================== + platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 + rootdir: /home/user + plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3 + collected 154975 items / 4 skipped / 154971 selected + ........................................................................ [ 0%] + ........................................................................ [ 99%] + ....................................... [100%] + + ==================================== ERRORS ==================================== + + =================================== FAILURES =================================== + + =============================== warnings summary =============================== + + =========================== short test summary info ============================ - ..................................................................S...... - ........S................................................................ - ......................................................................... + = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) = - ==================== 12130 passed, 12 skipped in 368.339 seconds ===================== +This is just an example of what information is shown. You might see a slightly different result as what is shown above. .. _install.dependencies: Dependencies ------------ +.. _install.required_dependencies: + +Required dependencies +~~~~~~~~~~~~~~~~~~~~~ + +pandas requires the following dependencies. + ================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`setuptools `__ 24.2.0 -`NumPy `__ 1.15.4 -`python-dateutil `__ 2.7.3 -`pytz `__ 2017.2 +`NumPy `__ 1.20.3 +`python-dateutil `__ 2.8.2 +`pytz `__ 2020.1 ================================================================ ========================== -.. _install.recommended_dependencies: +.. _install.optional_dependencies: -Recommended dependencies -~~~~~~~~~~~~~~~~~~~~~~~~ +Optional dependencies +~~~~~~~~~~~~~~~~~~~~~ -* `numexpr `__: for accelerating certain numerical operations. - ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.6.2 or higher. +pandas has many optional dependencies that are only used for specific methods. +For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while +:meth:`DataFrame.to_markdown` requires the ``tabulate`` package. 
If the +optional dependency is not installed, pandas will raise an ``ImportError`` when +the method requiring that dependency is called. + +If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) +as optional extras (e.g.,``pandas[performance, aws]>=1.5.0``). All optional dependencies can be installed with ``pandas[all]``, +and specific sets of dependencies are listed in the sections below. + +.. _install.recommended_dependencies: -* `bottleneck `__: for accelerating certain types of ``nan`` - evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, - must be Version 1.2.1 or higher. +Performance dependencies (recommended) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: You are highly encouraged to install these libraries, as they provide speed improvements, especially when working with large data sets. +Installable with ``pip install "pandas[performance]"`` -.. _install.optional_dependencies: +===================================================== ================== ================== =================================================================================================================================================================================== +Dependency Minimum Version pip extra Notes +===================================================== ================== ================== =================================================================================================================================================================================== +`numexpr `__ 2.7.3 performance Accelerates certain numerical operations by using uses multiple cores as well as smart chunking and caching to achieve large speedups +`bottleneck `__ 1.3.2 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. +`numba `__ 0.53.1 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. +===================================================== ================== ================== =================================================================================================================================================================================== -Optional dependencies -~~~~~~~~~~~~~~~~~~~~~ +Visualization +^^^^^^^^^^^^^ -Pandas has many optional dependencies that are only used for specific methods. -For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while -:meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the -optional dependency is not installed, pandas will raise an ``ImportError`` when -the method requiring that dependency is called. +Installable with ``pip install "pandas[plot, output_formatting]"``. 
-========================= ================== ============================================================= -Dependency Minimum Version Notes -========================= ================== ============================================================= -BeautifulSoup4 4.6.0 HTML parser for read_html (see :ref:`note `) -Jinja2 Conditional formatting with DataFrame.style -PyQt4 Clipboard I/O -PyQt5 Clipboard I/O -PyTables 3.4.3 HDF5-based reading / writing -SQLAlchemy 1.1.4 SQL support for databases other than sqlite -SciPy 0.19.0 Miscellaneous statistical functions -XLsxWriter 0.9.8 Excel writing -blosc Compression for HDF5 -fsspec 0.7.4 Handling files aside from local and HTTP -fastparquet 0.3.2 Parquet reading / writing -gcsfs 0.6.0 Google Cloud Storage access -html5lib HTML parser for read_html (see :ref:`note `) -lxml 3.8.0 HTML parser for read_html (see :ref:`note `) -matplotlib 2.2.2 Visualization -numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.5.7 Reading / writing for xlsx files -pandas-gbq 0.12.0 Google Big Query access -psycopg2 PostgreSQL engine for sqlalchemy -pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing -pymysql 0.7.11 MySQL engine for sqlalchemy -pyreadstat SPSS files (.sav) reading -pytables 3.4.3 HDF5 reading / writing -pyxlsb 1.0.6 Reading for xlsb files -qtpy Clipboard I/O -s3fs 0.4.0 Amazon S3 access -tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) -xarray 0.8.2 pandas-like API for N-dimensional data -xclip Clipboard I/O on linux -xlrd 1.1.0 Excel reading -xlwt 1.2.0 Excel writing -xsel Clipboard I/O on linux -zlib Compression for HDF5 -========================= ================== ============================================================= - -.. _optional_html: - -Optional dependencies for parsing HTML -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +========================= ================== ================== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== ================== ============================================================= +matplotlib 3.6.1 plot Plotting library +Jinja2 3.0.0 output_formatting Conditional formatting with DataFrame.style +tabulate 0.8.9 output_formatting Printing in Markdown-friendly format (see `tabulate`_) +========================= ================== ================== ============================================================= + +Computation +^^^^^^^^^^^ + +Installable with ``pip install "pandas[computation]"``. + +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +SciPy 1.7.1 computation Miscellaneous statistical functions +xarray 0.21.0 computation pandas-like API for N-dimensional data +========================= ================== =============== ============================================================= + +Excel files +^^^^^^^^^^^ + +Installable with ``pip install "pandas[excel]"``. 
+ +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +xlrd 2.0.1 excel Reading Excel +xlsxwriter 1.4.3 excel Writing Excel +openpyxl 3.0.7 excel Reading / writing for xlsx files +pyxlsb 1.0.8 excel Reading for xlsb files +========================= ================== =============== ============================================================= + +HTML +^^^^ + +Installable with ``pip install "pandas[html]"``. + +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +BeautifulSoup4 4.9.3 html HTML parser for read_html +html5lib 1.1 html HTML parser for read_html +lxml 4.6.3 html HTML parser for read_html +========================= ================== =============== ============================================================= One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: -.. versionchanged:: 0.23.0 - * `BeautifulSoup4`_ and `html5lib`_ * `BeautifulSoup4`_ and `lxml`_ * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ @@ -323,3 +383,107 @@ top-level :func:`~pandas.read_html` function: .. _BeautifulSoup4: https://www.crummy.com/software/BeautifulSoup .. _lxml: https://lxml.de .. _tabulate: https://github.com/astanin/python-tabulate + +XML +^^^ + +Installable with ``pip install "pandas[xml]"``. + +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +lxml 4.6.3 xml XML parser for read_xml and tree builder for to_xml +========================= ================== =============== ============================================================= + +SQL databases +^^^^^^^^^^^^^ + +Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. 
+ +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +SQLAlchemy 1.4.16 postgresql, SQL support for databases other than sqlite + mysql, + sql-other +psycopg2 2.8.6 postgresql PostgreSQL engine for sqlalchemy +pymysql 1.0.2 mysql MySQL engine for sqlalchemy +========================= ================== =============== ============================================================= + +Other data sources +^^^^^^^^^^^^^^^^^^ + +Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"`` + +========================= ================== ================ ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== ================ ============================================================= +PyTables 3.6.1 hdf5 HDF5-based reading / writing +blosc 1.21.0 hdf5 Compression for HDF5; only available on ``conda`` +zlib hdf5 Compression for HDF5 +fastparquet 0.6.3 - Parquet reading / writing (pyarrow is default) +pyarrow 7.0.0 parquet, feather Parquet, ORC, and feather reading / writing +pyreadstat 1.1.2 spss SPSS files (.sav) reading +odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing +========================= ================== ================ ============================================================= + +.. _install.warn_orc: + +.. warning:: + + * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda. + The following is a summary of the environment in which :func:`~pandas.read_orc` can work. + + ========================= ================== ============================================================= + System Conda PyPI + ========================= ================== ============================================================= + Linux Successful Failed + macOS Successful Failed + Windows Failed Failed + ========================= ================== ============================================================= + +Access data in the cloud +^^^^^^^^^^^^^^^^^^^^^^^^ + +Installable with ``pip install "pandas[fss, aws, gcp]"`` + +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +fsspec 2021.7.0 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +gcsfs 2021.7.0 gcp Google Cloud Storage access +pandas-gbq 0.15.0 gcp Google Big Query access +s3fs 2021.08.0 aws Amazon S3 access +========================= ================== =============== ============================================================= + +Clipboard +^^^^^^^^^ + +Installable with ``pip install "pandas[clipboard]"``. 
+ +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +PyQt4/PyQt5 5.15.1 clipboard Clipboard I/O +qtpy 2.2.0 clipboard Clipboard I/O +========================= ================== =============== ============================================================= + +.. note:: + + Depending on the operating system, system-level packages may need to be installed. + For the clipboard to operate on Linux, one of the CLI tools ``xclip`` or ``xsel`` must be installed on your system. + + +Compression +^^^^^^^^^^^ + +Installable with ``pip install "pandas[compression]"`` + +========================= ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =============== ============================================================= +brotli 0.7.0 compression Brotli compression +python-snappy 0.6.0 compression Snappy compression +Zstandard 0.15.2 compression Zstandard compression +========================= ================== =============== ============================================================= diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index dc9bec2284aab..2dcc8b0abe3b8 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -41,12 +41,16 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno .. ipython:: python - df = pd.DataFrame({ - "Name": ["Braund, Mr. Owen Harris", - "Allen, Mr. William Henry", - "Bonnell, Miss. Elizabeth"], - "Age": [22, 35, 58], - "Sex": ["male", "male", "female"]} + df = pd.DataFrame( + { + "Name": [ + "Braund, Mr. Owen Harris", + "Allen, Mr. William Henry", + "Bonnell, Miss. Elizabeth", + ], + "Age": [22, 35, 58], + "Sex": ["male", "male", "female"], + } ) df @@ -172,7 +176,7 @@ these are by default not taken into account by the :func:`~DataFrame.describe` m Many pandas operations return a ``DataFrame`` or a ``Series``. The :func:`~DataFrame.describe` method is an example of a pandas operation returning a -pandas ``Series``. +pandas ``Series`` or a pandas ``DataFrame``. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index c6c6bfefc4303..dbb1be8c4d875 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -9,45 +9,20 @@ .. raw:: html
-
+
Data used for this tutorial:
    -
  • - -
    -
    -

    - -This tutorial uses the Titanic data set, stored as CSV. The data -consists of the following data columns: - -- PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 - for survived. -- Pclass: There are 3 classes: Class 1, Class 2 and Class 3. -- Name: Name of passenger. -- Sex: Gender of passenger. -- Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. -- Ticket: Ticket number of passenger. -- Fare: Indicating the fare. -- Cabin: The cabin of passenger. -- Embarked: The embarked category. +

  • + +.. include:: includes/titanic.rst .. raw:: html -

    - To raw data -
-
- - + +
How do I read and write tabular data? @@ -138,7 +113,7 @@ My colleague requested the Titanic data as a spreadsheet. .. ipython:: python - titanic.to_excel('titanic.xlsx', sheet_name='passengers', index=False) + titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False) Whereas ``read_*`` functions are used to read data to pandas, the ``to_*`` methods are used to store data. The :meth:`~DataFrame.to_excel` method stores @@ -156,7 +131,7 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data .. ipython:: python - titanic = pd.read_excel('titanic.xlsx', sheet_name='passengers') + titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers") .. ipython:: python @@ -166,7 +141,8 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data :suppress: import os - os.remove('titanic.xlsx') + + os.remove("titanic.xlsx") .. raw:: html @@ -222,7 +198,7 @@ The method :meth:`~DataFrame.info` provides technical information about a .. raw:: html -
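The ``read_*``/``to_*`` symmetry described here for Excel works the same way for the other supported formats. A minimal sketch with CSV, assuming the ``titanic`` ``DataFrame`` from this tutorial (the file name is illustrative only)::

    # Write the table to disk, then read it back with the matching reader.
    titanic.to_csv("titanic_copy.csv", index=False)
    titanic_copy = pd.read_csv("titanic_copy.csv")
    titanic_copy.head()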
+
To user guide For a complete overview of the input and output possibilities from and to pandas, see the user guide section about :ref:`reader and writer functions `. diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 8476fee5e1eee..483e122ec4d42 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -9,43 +9,15 @@ .. raw:: html
-
+
Data used for this tutorial:
    -
  • - -
    -
    -

    - -This tutorial uses the Titanic data set, stored as CSV. The data -consists of the following data columns: - -- PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 - for survived. -- Pclass: There are 3 classes: Class 1, Class 2 and Class 3. -- Name: Name of passenger. -- Sex: Gender of passenger. -- Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. -- Ticket: Ticket number of passenger. -- Fare: Indicating the fare. -- Cabin: The cabin of passenger. -- Embarked: The embarked category. +

  • -.. raw:: html - -

    - To raw data -
-
+.. include:: includes/titanic.rst .. ipython:: python @@ -54,8 +26,8 @@ consists of the following data columns: .. raw:: html - - + +
How do I select a subset of a ``DataFrame``? @@ -199,7 +171,7 @@ selection brackets ``[]``. Only rows for which the value is ``True`` will be selected. We know from before that the original Titanic ``DataFrame`` consists of -891 rows. Let’s have a look at the amount of rows which satisfy the +891 rows. Let’s have a look at the number of rows which satisfy the condition by checking the ``shape`` attribute of the resulting ``DataFrame`` ``above_35``: @@ -270,7 +242,7 @@ I want to work with passenger data for which the age is known. age_no_na.head() The :meth:`~Series.notna` conditional function returns a ``True`` for each row the -values are not an ``Null`` value. As such, this can be combined with the +values are not a ``Null`` value. As such, this can be combined with the selection brackets ``[]`` to filter the data table. .. raw:: html @@ -296,6 +268,8 @@ For more dedicated functions on missing values, see the user guide section about
+.. _10min_tut_03_subset.rows_and_columns: + How do I select specific rows and columns from a ``DataFrame``? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -384,9 +358,9 @@ See the user guide section on :ref:`different choices for indexing To user guide -A full overview about indexing is provided in the user guide pages on :ref:`indexing and selecting data `. +A full overview of indexing is provided in the user guide pages on :ref:`indexing and selecting data `. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index f3d99ee56359a..ddc8a37911c98 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -2,6 +2,12 @@ {{ header }} +How do I create plots in pandas? +---------------------------------- + +.. image:: ../../_static/schemas/04_plot_overview.svg + :align: center + .. ipython:: python import pandas as pd @@ -10,38 +16,19 @@ .. raw:: html
-
+
Data used for this tutorial:
    -
  • - -
    -
    -

    - -For this tutorial, air quality data about :math:`NO_2` is used, made -available by `openaq `__ and using the -`py-openaq `__ package. -The ``air_quality_no2.csv`` data set provides :math:`NO_2` values for -the measurement stations *FR04014*, *BETR801* and *London Westminster* -in respectively Paris, Antwerp and London. - -.. raw:: html - -

    - To raw data -
    -
    +
  • + +.. include:: includes/air_quality_no2.rst .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. note:: @@ -54,12 +41,6 @@ in respectively Paris, Antwerp and London.
-How to create plots in pandas? ------------------------------- - -.. image:: ../../_static/schemas/04_plot_overview.svg - :align: center - .. raw:: html
    @@ -71,6 +52,7 @@ I want a quick visual check of the data. @savefig 04_airqual_quick.png air_quality.plot() + plt.show() With a ``DataFrame``, pandas creates by default one line plot for each of the columns with numeric data. @@ -87,10 +69,19 @@ the columns with numeric data. I want to plot only the columns of the data table with the data from Paris. +.. ipython:: python + :suppress: + + # We need to clear the figure here as, within doc generation, the plot + # accumulates data on each plot(). This is not needed when running + # in a notebook, so is suppressed from output. + plt.clf() + .. ipython:: python @savefig 04_airqual_paris.png air_quality["station_paris"].plot() + plt.show() To plot a specific column, use the selection method of the :ref:`subset data tutorial <10min_tut_03_subset>` in combination with the :meth:`~DataFrame.plot` @@ -107,14 +98,13 @@ method. Hence, the :meth:`~DataFrame.plot` method works on both ``Series`` and
    • -I want to visually compare the :math:`N0_2` values measured in London versus Paris. +I want to visually compare the :math:`NO_2` values measured in London versus Paris. .. ipython:: python @savefig 04_airqual_scatter.png - air_quality.plot.scatter(x="station_london", - y="station_paris", - alpha=0.5) + air_quality.plot.scatter(x="station_london", y="station_paris", alpha=0.5) + plt.show() .. raw:: html @@ -127,12 +117,15 @@ standard Python to get an overview of the available plot methods: .. ipython:: python - [method_name for method_name in dir(air_quality.plot) - if not method_name.startswith("_")] + [ + method_name + for method_name in dir(air_quality.plot) + if not method_name.startswith("_") + ] .. note:: - In many development environments as well as ipython and - jupyter notebook, use the TAB button to get an overview of the available + In many development environments as well as IPython and + Jupyter Notebook, use the TAB button to get an overview of the available methods, for example ``air_quality.plot.`` + TAB. One of the options is :meth:`DataFrame.plot.box`, which refers to a @@ -143,6 +136,7 @@ method is applicable on the air quality example data: @savefig 04_airqual_boxplot.png air_quality.plot.box() + plt.show() .. raw:: html @@ -166,10 +160,11 @@ I want each of the columns in a separate subplot. @savefig 04_airqual_area_subplot.png axs = air_quality.plot.area(figsize=(12, 4), subplots=True) + plt.show() -Separate subplots for each of the data columns is supported by the ``subplots`` argument +Separate subplots for each of the data columns are supported by the ``subplots`` argument of the ``plot`` functions. The builtin options available in each of the pandas plot -functions that are worthwhile to have a look. +functions are worth reviewing. .. raw:: html @@ -196,44 +191,47 @@ I want to further customize, extend or save the resulting plot. .. ipython:: python - fig, axs = plt.subplots(figsize=(12, 4)); - air_quality.plot.area(ax=axs); + fig, axs = plt.subplots(figsize=(12, 4)) + air_quality.plot.area(ax=axs) + axs.set_ylabel("NO$_2$ concentration") @savefig 04_airqual_customized.png - axs.set_ylabel("NO$_2$ concentration"); fig.savefig("no2_concentrations.png") + plt.show() .. ipython:: python :suppress: import os - os.remove('no2_concentrations.png') + + os.remove("no2_concentrations.png") .. raw:: html
    -Each of the plot objects created by pandas are a -`matplotlib `__ object. As Matplotlib provides +Each of the plot objects created by pandas is a +`Matplotlib `__ object. As Matplotlib provides plenty of options to customize plots, making the link between pandas and -Matplotlib explicit enables all the power of matplotlib to the plot. +Matplotlib explicit enables all the power of Matplotlib to the plot. This strategy is applied in the previous example: :: - fig, axs = plt.subplots(figsize=(12, 4)) # Create an empty matplotlib Figure and Axes + fig, axs = plt.subplots(figsize=(12, 4)) # Create an empty Matplotlib Figure and Axes air_quality.plot.area(ax=axs) # Use pandas to put the area plot on the prepared Figure/Axes - axs.set_ylabel("NO$_2$ concentration") # Do any matplotlib customization you like - fig.savefig("no2_concentrations.png") # Save the Figure/Axes using the existing matplotlib method. + axs.set_ylabel("NO$_2$ concentration") # Do any Matplotlib customization you like + fig.savefig("no2_concentrations.png") # Save the Figure/Axes using the existing Matplotlib method. + plt.show() # Display the plot .. raw:: html

    REMEMBER

    -- The ``.plot.*`` methods are applicable on both Series and DataFrames +- The ``.plot.*`` methods are applicable on both Series and DataFrames. - By default, each of the columns is plotted as a different element - (line, boxplot,…) + (line, boxplot,…). - Any plot created by pandas is a Matplotlib object. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst index d4f6a8d6bb4a2..d59a70cc2818e 100644 --- a/doc/source/getting_started/intro_tutorials/05_add_columns.rst +++ b/doc/source/getting_started/intro_tutorials/05_add_columns.rst @@ -9,38 +9,19 @@ .. raw:: html
    -
    +
    Data used for this tutorial:
      -
    • - -
      -
      -

      - -For this tutorial, air quality data about :math:`NO_2` is used, made -available by `openaq `__ and using the -`py-openaq `__ package. -The ``air_quality_no2.csv`` data set provides :math:`NO_2` values for -the measurement stations *FR04014*, *BETR801* and *London Westminster* -in respectively Paris, Antwerp and London. +

    • -.. raw:: html - -

      - To raw data -
    -
    +.. include:: includes/air_quality_no2.rst .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. raw:: html @@ -49,8 +30,8 @@ in respectively Paris, Antwerp and London.
-How to create new columns derived from existing columns? --------------------------------------------------------- +How to create new columns derived from existing columns +------------------------------------------------------- .. image:: ../../_static/schemas/05_newcolumn_1.svg :align: center @@ -60,7 +41,7 @@ How to create new columns derived from existing columns?
  • -I want to express the :math:`NO_2` concentration of the station in London in mg/m\ :math:`^3` +I want to express the :math:`NO_2` concentration of the station in London in mg/m\ :math:`^3`. (*If we assume temperature of 25 degrees Celsius and pressure of 1013 hPa, the conversion factor is 1.882*) @@ -79,7 +60,7 @@ at the left side of the assignment.
.. note:: - The calculation of the values is done **element_wise**. This + The calculation of the values is done **element-wise**. This means all values in the given column are multiplied by the value 1.882 at once. You do not need to use a loop to iterate each of the rows! @@ -91,12 +72,13 @@ at the left side of the assignment.
  • -I want to check the ratio of the values in Paris versus Antwerp and save the result in a new column +I want to check the ratio of the values in Paris versus Antwerp and save the result in a new column. .. ipython:: python - air_quality["ratio_paris_antwerp"] = \ + air_quality["ratio_paris_antwerp"] = ( air_quality["station_paris"] / air_quality["station_antwerp"] + ) air_quality.head() The calculation is again element-wise, so the ``/`` is applied *for the @@ -107,24 +89,29 @@ values in each row*.
-Also other mathematical operators (+, -, \*, /) or -logical operators (<, >, =,…) work element wise. The latter was already +Also other mathematical operators (``+``, ``-``, ``*``, ``/``,…) or +logical operators (``<``, ``>``, ``==``,…) work element-wise. The latter was already used in the :ref:`subset data tutorial <10min_tut_03_subset>` to filter rows of a table using a conditional expression. +If you need more advanced logic, you can use arbitrary Python code via :meth:`~DataFrame.apply`. + .. raw:: html
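A minimal sketch of what such :meth:`~DataFrame.apply` logic could look like on this data set; the threshold and the new column name are illustrative only::

    # axis=1 passes one row at a time to the function.
    air_quality["london_level"] = air_quality.apply(
        lambda row: "high" if row["station_london"] > 25 else "low", axis=1
    )
    air_quality.head()

For a simple comparison like this one, the element-wise form (``air_quality["station_london"] > 25``) is faster and preferred; ``apply`` is mainly useful when the per-row logic cannot be expressed with the vectorized operators above.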
  • -I want to rename the data columns to the corresponding station identifiers used by openAQ +I want to rename the data columns to the corresponding station identifiers used by `OpenAQ `__. .. ipython:: python air_quality_renamed = air_quality.rename( - columns={"station_antwerp": "BETR801", - "station_paris": "FR04014", - "station_london": "London Westminster"}) + columns={ + "station_antwerp": "BETR801", + "station_paris": "FR04014", + "station_london": "London Westminster", + } + ) .. ipython:: python diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index c7363b94146ac..fe3ae820e7085 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -9,43 +9,15 @@ .. raw:: html
    -
    +
    Data used for this tutorial:
      -
    • - -
      -
      -

      - -This tutorial uses the Titanic data set, stored as CSV. The data -consists of the following data columns: - -- PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 - for survived. -- Pclass: There are 3 classes: Class 1, Class 2 and Class 3. -- Name: Name of passenger. -- Sex: Gender of passenger. -- Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. -- Ticket: Ticket number of passenger. -- Fare: Indicating the fare. -- Cabin: The cabin of passenger. -- Embarked: The embarked category. +

    • -.. raw:: html - -

      - To raw data -
    -
    +.. include:: includes/titanic.rst .. ipython:: python @@ -54,12 +26,12 @@ consists of the following data columns: .. raw:: html -
  • -
+ +
-How to calculate summary statistics? ------------------------------------- +How to calculate summary statistics +----------------------------------- Aggregating statistics ~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +74,7 @@ What is the median age and ticket fare price of the Titanic passengers? titanic[["Age", "Fare"]].median() The statistic applied to multiple columns of a ``DataFrame`` (the selection of two columns -return a ``DataFrame``, see the :ref:`subset data tutorial <10min_tut_03_subset>`) is calculated for each numeric column. +returns a ``DataFrame``, see the :ref:`subset data tutorial <10min_tut_03_subset>`) is calculated for each numeric column. .. raw:: html @@ -110,7 +82,7 @@ return a ``DataFrame``, see the :ref:`subset data tutorial <10min_tut_03_subset> The aggregating statistic can be calculated for multiple columns at the -same time. Remember the ``describe`` function from :ref:`first tutorial <10min_tut_01_tableoriented>` tutorial? +same time. Remember the ``describe`` function from the :ref:`first tutorial <10min_tut_01_tableoriented>`? .. ipython:: python @@ -122,8 +94,12 @@ aggregating statistics for given columns can be defined using the .. ipython:: python - titanic.agg({'Age': ['min', 'max', 'median', 'skew'], - 'Fare': ['min', 'max', 'median', 'mean']}) + titanic.agg( + { + "Age": ["min", "max", "median", "skew"], + "Fare": ["min", "max", "median", "mean"], + } + ) .. raw:: html @@ -167,8 +143,8 @@ returned. Calculating a given statistic (e.g. ``mean`` age) *for each category in a column* (e.g. male/female in the ``Sex`` column) is a common pattern. -The ``groupby`` method is used to support this type of operations. More -general, this fits in the more general ``split-apply-combine`` pattern: +The ``groupby`` method is used to support this type of operation. This +fits in the more general ``split-apply-combine`` pattern: - **Split** the data into groups - **Apply** a function to each group independently @@ -178,14 +154,14 @@ The apply and combine steps are typically done together in pandas. In the previous example, we explicitly selected the 2 columns first. If not, the ``mean`` method is applied to each column containing numerical -columns: +data by passing ``numeric_only=True``: .. ipython:: python - titanic.groupby("Sex").mean() + titanic.groupby("Sex").mean(numeric_only=True) It does not make much sense to get the average value of the ``Pclass``. -if we are only interested in the average age for each gender, the +If we are only interested in the average age for each gender, the selection of columns (rectangular brackets ``[]`` as usual) is supported on the grouped data as well: @@ -197,7 +173,7 @@ on the grouped data as well: :align: center .. note:: - The `Pclass` column contains numerical data but actually + The ``Pclass`` column contains numerical data but actually represents 3 categories (or factors) with respectively the labels ‘1’, ‘2’ and ‘3’. Calculating statistics on these does not make much sense. Therefore, pandas provides a ``Categorical`` data type to handle this @@ -278,7 +254,7 @@ within each group:
To user guide -The user guide has a dedicated section on ``value_counts`` , see page on :ref:`discretization `. +The user guide has a dedicated section on ``value_counts``, see the page on :ref:`discretization `. .. raw:: html @@ -289,10 +265,10 @@

REMEMBER

-- Aggregation statistics can be calculated on entire columns or rows -- ``groupby`` provides the power of the *split-apply-combine* pattern +- Aggregation statistics can be calculated on entire columns or rows. +- ``groupby`` provides the power of the *split-apply-combine* pattern. - ``value_counts`` is a convenient shortcut to count the number of - entries in each category of a variable + entries in each category of a variable. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst index c16fec6aaba9f..6c920c92e4d05 100644 --- a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst +++ b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst @@ -9,43 +9,15 @@ .. raw:: html
-
+
Data used for this tutorial:
    -
  • - -
    -
    -

    - -This tutorial uses the Titanic data set, stored as CSV. The data -consists of the following data columns: - -- PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 - for survived. -- Pclass: There are 3 classes: Class 1, Class 2 and Class 3. -- Name: Name of passenger. -- Sex: Gender of passenger. -- Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. -- Ticket: Ticket number of passenger. -- Fare: Indicating the fare. -- Cabin: The cabin of passenger. -- Embarked: The embarked category. +

  • -.. raw:: html - -

    - To raw data -
-
+.. include:: includes/titanic.rst .. ipython:: python @@ -55,7 +27,7 @@ consists of the following data columns: .. raw:: html -
  • +
  • @@ -65,7 +37,7 @@ consists of the following data columns: This tutorial uses air quality data about :math:`NO_2` and Particulate matter less than 2.5 micrometers, made available by -`openaq `__ and using the +`OpenAQ `__ and using the `py-openaq `__ package. The ``air_quality_long.csv`` data set provides :math:`NO_2` and :math:`PM_{25}` values for the measurement stations *FR04014*, *BETR801* @@ -95,24 +67,25 @@ measurement. .. raw:: html

    - To raw data + To raw data
  • .. ipython:: python - air_quality = pd.read_csv("data/air_quality_long.csv", - index_col="date.utc", parse_dates=True) + air_quality = pd.read_csv( + "data/air_quality_long.csv", index_col="date.utc", parse_dates=True + ) air_quality.head() .. raw:: html - - + +
    -How to reshape the layout of tables? ------------------------------------- +How to reshape the layout of tables +----------------------------------- Sort table rows ~~~~~~~~~~~~~~~ @@ -144,7 +117,7 @@ I want to sort the Titanic data according to the cabin class and age in descendi titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head() -With :meth:`Series.sort_values`, the rows in the table are sorted according to the +With :meth:`DataFrame.sort_values`, the rows in the table are sorted according to the defined column(s). The index will follow the row order. .. raw:: html @@ -157,7 +130,7 @@ defined column(s). The index will follow the row order.
    To user guide -More details about sorting of tables is provided in the using guide section on :ref:`sorting data `. +More details about sorting of tables are provided in the user guide section on :ref:`sorting data `. .. raw:: html @@ -169,7 +142,7 @@ Long to wide table format Let’s use a small subset of the air quality data set. We focus on :math:`NO_2` data and only use the first two measurements of each location (i.e. the head of each group). The subset of data will be -called ``no2_subset`` +called ``no2_subset``. .. ipython:: python
    • -I want the values for the three stations as separate columns next to each other +I want the values for the three stations as separate columns next to each other. .. ipython:: python @@ -204,7 +177,7 @@ for each index/column combination is required.
    -As pandas support plotting of multiple columns (see :ref:`plotting tutorial <10min_tut_04_plotting>`) out of the box, the conversion from +As pandas supports plotting of multiple columns (see :ref:`plotting tutorial <10min_tut_04_plotting>`) out of the box, the conversion from *long* to *wide* table format enables the plotting of the different time series at the same time: @@ -243,16 +216,17 @@ Pivot table
    • -I want the mean concentrations for :math:`NO_2` and :math:`PM_{2.5}` in each of the stations in table form +I want the mean concentrations for :math:`NO_2` and :math:`PM_{2.5}` in each of the stations in table form. .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean") + air_quality.pivot_table( + values="value", index="location", columns="parameter", aggfunc="mean" + ) In the case of :meth:`~DataFrame.pivot`, the data is only rearranged. When multiple values need to be aggregated (in this specific case, the values on -different time steps) :meth:`~DataFrame.pivot_table` can be used, providing an +different time steps), :meth:`~DataFrame.pivot_table` can be used, providing an aggregation function (e.g. mean) on how to combine these values. .. raw:: html @@ -261,14 +235,18 @@ aggregation function (e.g. mean) on how to combine these values.
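Because :meth:`~DataFrame.pivot_table` aggregates, the same summary can also be built with ``groupby`` followed by ``unstack``; a minimal sketch on the same data::

    # Group on both columns, take the mean, and move the last index level
    # (parameter) into the columns.
    air_quality.groupby(["location", "parameter"])["value"].mean().unstack()

``pivot_table`` is usually the more readable spelling when the end result you want is a wide summary table.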
    Pivot table is a well known concept in spreadsheet software. When -interested in summary columns for each variable separately as well, put -the ``margin`` parameter to ``True``: +interested in the row/column margins (subtotals) for each variable, set +the ``margins`` parameter to ``True``: .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean", - margins=True) + air_quality.pivot_table( + values="value", + index="location", + columns="parameter", + aggfunc="mean", + margins=True, + ) .. raw:: html @@ -305,7 +283,7 @@ Wide to long format ~~~~~~~~~~~~~~~~~~~ Starting again from the wide format table created in the previous -section: +section, we add a new index to the ``DataFrame`` with :meth:`~DataFrame.reset_index`. .. ipython:: python @@ -320,7 +298,7 @@ section:
    • -I want to collect all air quality :math:`NO_2` measurements in a single column (long format) +I want to collect all air quality :math:`NO_2` measurements in a single column (long format). .. ipython:: python @@ -341,21 +319,21 @@ will *melt* all columns NOT mentioned in ``id_vars`` together into two columns: A column with the column header names and a column with the values itself. The latter column gets by default the name ``value``. -The :func:`pandas.melt` method can be defined in more detail: +The parameters passed to :func:`pandas.melt` can be defined in more detail: .. ipython:: python - no_2 = no2_pivoted.melt(id_vars="date.utc", - value_vars=["BETR801", - "FR04014", - "London Westminster"], - value_name="NO_2", - var_name="id_location") + no_2 = no2_pivoted.melt( + id_vars="date.utc", + value_vars=["BETR801", "FR04014", "London Westminster"], + value_name="NO_2", + var_name="id_location", + ) no_2.head() -The result in the same, but in more detail defined: +The additional parameters have the following effects: -- ``value_vars`` defines explicitly which columns to *melt* together +- ``value_vars`` defines which columns to *melt* together - ``value_name`` provides a custom column name for the values column instead of the default column name ``value`` - ``var_name`` provides a custom column name for the column collecting @@ -382,11 +360,11 @@ Conversion from wide to long format with :func:`pandas.melt` is explained in the

      REMEMBER

      -- Sorting by one or more columns is supported by ``sort_values`` +- Sorting by one or more columns is supported by ``sort_values``. - The ``pivot`` function is purely restructuring of the data, - ``pivot_table`` supports aggregations + ``pivot_table`` supports aggregations. - The reverse of ``pivot`` (long to wide format) is ``melt`` (wide to - long format) + long format). .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 600a75b156ac4..f369feb3e03a5 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -9,13 +9,13 @@ .. raw:: html
      -
      +
      Data used for this tutorial:
        -
      • +
      • @@ -24,7 +24,7 @@

        For this tutorial, air quality data about :math:`NO_2` is used, made available by -`openaq `__ and downloaded using the +`OpenAQ `__ and downloaded using the `py-openaq `__ package. The ``air_quality_no2_long.csv`` data set provides :math:`NO_2` @@ -34,7 +34,7 @@ Westminster* in respectively Paris, Antwerp and London. .. raw:: html

        - To raw data + To raw data
      @@ -49,7 +49,7 @@ Westminster* in respectively Paris, Antwerp and London. .. raw:: html
    • -
    • +
    • @@ -59,7 +59,7 @@ Westminster* in respectively Paris, Antwerp and London. For this tutorial, air quality data about Particulate matter less than 2.5 micrometers is used, made available by -`openaq `__ and downloaded using the +`OpenAQ `__ and downloaded using the `py-openaq `__ package. The ``air_quality_pm25_long.csv`` data set provides :math:`PM_{25}` @@ -69,7 +69,7 @@ Westminster* in respectively Paris, Antwerp and London. .. raw:: html

      - To raw data + To raw data
    @@ -88,8 +88,8 @@ Westminster* in respectively Paris, Antwerp and London.
    -How to combine data from multiple tables? ------------------------------------------ +How to combine data from multiple tables +---------------------------------------- Concatenating objects ~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +102,7 @@ Concatenating objects
    • -I want to combine the measurements of :math:`NO_2` and :math:`PM_{25}`, two tables with a similar structure, in a single table +I want to combine the measurements of :math:`NO_2` and :math:`PM_{25}`, two tables with a similar structure, in a single table. .. ipython:: python @@ -110,7 +110,7 @@ I want to combine the measurements of :math:`NO_2` and :math:`PM_{25}`, two tabl air_quality.head() The :func:`~pandas.concat` function performs concatenation operations of multiple -tables along one of the axis (row-wise or column-wise). +tables along one of the axes (row-wise or column-wise). .. raw:: html @@ -123,9 +123,9 @@ concatenated tables to verify the operation: .. ipython:: python - print('Shape of the `air_quality_pm25` table: ', air_quality_pm25.shape) - print('Shape of the `air_quality_no2` table: ', air_quality_no2.shape) - print('Shape of the resulting `air_quality` table: ', air_quality.shape) + print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape) + print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape) + print('Shape of the resulting ``air_quality`` table: ', air_quality.shape) Hence, the resulting table has 3178 = 1110 + 2068 rows. @@ -149,17 +149,13 @@ origin of the table (either ``no2`` from table ``air_quality_no2`` or In this specific example, the ``parameter`` column provided by the data ensures that each of the original tables can be identified. This is not -always the case. the ``concat`` function provides a convenient solution +always the case. The ``concat`` function provides a convenient solution with the ``keys`` argument, adding an additional (hierarchical) row index. For example: .. ipython:: python - air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], - keys=["PM25", "NO2"]) - -.. ipython:: python - + air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"]) air_quality_.head() .. note:: @@ -233,8 +229,7 @@ Add the station coordinates, provided by the stations metadata table, to the cor .. ipython:: python - air_quality = pd.merge(air_quality, stations_coord, - how='left', on='location') + air_quality = pd.merge(air_quality, stations_coord, how="left", on="location") air_quality.head() Using the :meth:`~pandas.merge` function, for each of the rows in the @@ -256,7 +251,7 @@ supports multiple join options similar to database-style operations.
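A minimal sketch of those join options on two small, purely illustrative tables (not part of this tutorial's data)::

    left = pd.DataFrame({"key": ["A", "B", "C"], "x": [1, 2, 3]})
    right = pd.DataFrame({"key": ["B", "C", "D"], "y": [4, 5, 6]})

    # Inner join (the default): only keys present in both tables are kept.
    pd.merge(left, right, how="inner", on="key")

    # Left join: every key from the left table is kept; unmatched values become NaN.
    pd.merge(left, right, how="left", on="key")

The tutorial uses ``how="left"`` so that no measurement rows are dropped when the metadata table has no matching entry.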
      • -Add the parameter full description and name, provided by the parameters metadata table, to the measurements table +Add the parameters' full description and name, provided by the parameters metadata table, to the measurements table. .. warning:: The air quality parameters metadata are stored in a data file diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 19351e0e3bc75..76dd836098f58 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -10,13 +10,13 @@ .. raw:: html
        -
        +
        Data used for this tutorial:
          -
        • +
        • @@ -26,7 +26,7 @@ For this tutorial, air quality data about :math:`NO_2` and Particulate matter less than 2.5 micrometers is used, made available by -`openaq `__ and downloaded using the +`OpenAQ `__ and downloaded using the `py-openaq `__ package. The ``air_quality_no2_long.csv"`` data set provides :math:`NO_2` values for the measurement stations *FR04014*, *BETR801* and *London @@ -35,7 +35,7 @@ Westminster* in respectively Paris, Antwerp and London. .. raw:: html

          - To raw data + To raw data
        @@ -55,8 +55,10 @@ Westminster* in respectively Paris, Antwerp and London.
    -How to handle time series data with ease? ------------------------------------------ +How to handle time series data with ease +---------------------------------------- + +.. _10min_tut_09_timeseries.properties: Using pandas datetime properties ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,7 +144,7 @@ I want to add a new column to the ``DataFrame`` containing only the month of the By using ``Timestamp`` objects for dates, a lot of time-related properties are provided by pandas. For example the ``month``, but also -``year``, ``weekofyear``, ``quarter``,… All of these properties are +``year``, ``quarter``,… All of these properties are accessible by the ``dt`` accessor. .. raw:: html @@ -204,11 +206,10 @@ Plot the typical :math:`NO_2` pattern during the day of our time series of all s .. ipython:: python fig, axs = plt.subplots(figsize=(12, 4)) - air_quality.groupby( - air_quality["datetime"].dt.hour)["value"].mean().plot(kind='bar', - rot=0, - ax=axs) - plt.xlabel("Hour of the day"); # custom x label using matplotlib + air_quality.groupby(air_quality["datetime"].dt.hour)["value"].mean().plot( + kind='bar', rot=0, ax=axs + ) + plt.xlabel("Hour of the day"); # custom x label using Matplotlib @savefig 09_bar_chart.png plt.ylabel("$NO_2 (µg/m^3)$"); diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 93ad35fb1960b..5b1885791d8fb 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -9,43 +9,14 @@ .. raw:: html
    -
    +
    Data used for this tutorial:
      -
    • - -
      -
      -

      - -This tutorial uses the Titanic data set, stored as CSV. The data -consists of the following data columns: - -- PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 - for survived. -- Pclass: There are 3 classes: Class 1, Class 2 and Class 3. -- Name: Name of passenger. -- Sex: Gender of passenger. -- Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. -- Ticket: Ticket number of passenger. -- Fare: Indicating the fare. -- Cabin: The cabin of passenger. -- Embarked: The embarked category. - -.. raw:: html - -

      - To raw data -
      -
      +
    • +.. include:: includes/titanic.rst .. ipython:: python @@ -54,27 +25,27 @@ consists of the following data columns: .. raw:: html -
    • -
    + +
    -How to manipulate textual data? -------------------------------- +How to manipulate textual data +------------------------------ .. raw:: html
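A minimal sketch of the kind of string operations this tutorial covers, using the ``str`` accessor on the ``Name`` column of the ``titanic`` ``DataFrame`` loaded above::

    # Vectorized string methods are reached through the str accessor.
    titanic["Name"].str.lower().head()

    # Split on the comma and keep the first part (the surname).
    titanic["Name"].str.split(",").str.get(0).head()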