diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..7093b531 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Generate Docs + +on: + push: + branches: [ stable ] + +jobs: + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Python + uses: actions/setup-python@v1 + with: + python-version: 3.8 + + - name: Build + run: | + sudo apt-get install graphviz pandoc + python -m pip install --upgrade pip + pip install -e .[dev] + make docs + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{secrets.GITHUB_TOKEN}} + publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..cbadf809 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,112 @@ +name: Run Tests + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ master ] + +jobs: + devel: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.10'] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U "pip<=24.1" setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' + - name: Install package + run: pip install .[dev] + - name: make test-devel + run: make test-devel + + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11'] + os: [ubuntu-20.04, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' + - name: Install package and dependencies + run: pip install rundoc .[mlprimitives] + - name: make test-readme + run: make test-readme + + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + os: [ubuntu-20.04, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install .[unit] + - name: make test-unit + run: make test-unit + + unit-mlprimitives: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11'] + os: [ubuntu-20.04, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' + - name: Install package and dependencies + run: pip install .[test] + - name: make test-mlprimitives + run: make test-mlprimitives + + tutorials: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11'] + os: [ubuntu-20.04] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'ubuntu-20.04' + name: Install 
dependencies - Ubuntu + run: sudo apt-get install graphviz + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' + - name: Install package and dependencies + run: pip install .[examples] + - name: make test-tutorials + run: make test-tutorials diff --git a/.gitignore b/.gitignore index cbc1f8c1..037d677e 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/pipeline.json # PyBuilder target/ @@ -108,3 +109,4 @@ ENV/ .*.swp mlblocks/data +examples/tutorials/pipeline.pkl diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 136bd690..00000000 --- a/.travis.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Config file for automatic testing at travis-ci.org -language: python -python: - - 3.6 - - 3.5 - -# Command to install dependencies -install: - - pip install -U tox-travis codecov - - sudo apt-get install graphviz - -# Command to run tests -script: tox - -after_success: codecov - -deploy: - - - provider: pages - skip-cleanup: true - github-token: "$GITHUB_TOKEN" - keep-history: true - local-dir: docs/_build/html - target-branch: gh-pages - on: - branch: master - python: 3.6 diff --git a/AUTHORS.rst b/AUTHORS.rst index eb8885c9..7245c735 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,3 +10,4 @@ Contributors * William Xue * Akshay Ravikumar * Laura Gustafson +* Erica Chiu diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2db74080..43acf3a0 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,7 +15,7 @@ Types of Contributions Report Bugs ~~~~~~~~~~~ -Report bugs at https://github.com/HDI-Project/MLBlocks/issues. +Report bugs at https://github.com/MLBazaar/MLBlocks/issues. If you are reporting a bug, please include: @@ -45,7 +45,7 @@ articles, and such. Submit Feedback ~~~~~~~~~~~~~~~ -The best way to send feedback is to file an issue at https://github.com/HDI-Project/MLBlocks/issues. +The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues. If you are proposing a feature: @@ -120,8 +120,8 @@ Before you submit a pull request, check that it meets these guidelines: 4. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.rst. -5. The pull request should work for Python2.7, 3.4, 3.5 and 3.6. Check - https://travis-ci.org/HDI-Project/MLBlocks/pull_requests +5. The pull request should work for all the supported python version. Check + https://travis-ci.org/MLBazaar/MLBlocks/pull_requests and make sure that all the checks pass. Unit Testing Guidelines @@ -172,24 +172,56 @@ The process of releasing a new version involves several steps combining both ``g 1. Merge what is in ``master`` branch into ``stable`` branch. 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files. -3. Create a new TAG pointing at the correspoding commit in ``stable`` branch. +3. Create a new git tag pointing at the corresponding commit in ``stable`` branch. 4. Merge the new commit from ``stable`` into ``master``. -5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next - development interation. +5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` + to open the next development iteration. 
-**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled -**Unreleased** with the list of changes that will be included in the new version, and that -these changes are committed and available in ``master`` branch. -Normally this is just a list of the Pull Requests that have been merged since the latest version. +.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new + entry that explains the changes that will be included in the new version. + Normally this is just a list of the Pull Requests that have been merged to master + since the last release. -Once this is done, just run the following commands:: +Once this is done, run of the following commands: + +1. If you are releasing a patch version:: - git checkout stable - git merge --no-ff master # This creates a merge commit - bumpversion release # This creates a new commit and a TAG - git push --tags origin stable make release - git checkout master - git merge stable - bumpversion --no-tag patch - git push + +2. If you are releasing a minor version:: + + make release-minor + +3. If you are releasing a major version:: + + make release-major + +Release Candidates +~~~~~~~~~~~~~~~~~~ + +Sometimes it is necessary or convenient to upload a release candidate to PyPi as a pre-release, +in order to make some of the new features available for testing on other projects before they +are included in an actual full-blown release. + +In order to perform such an action, you can execute:: + + make release-candidate + +This will perform the following actions: + +1. Build and upload the current version to PyPi as a pre-release, with the format ``X.Y.Z.devN`` + +2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)`` + +After this is done, the new pre-release can be installed by including the ``dev`` section in the +dependency specification, either in ``setup.py``:: + + install_requires = [ + ... + 'mlblocks>=X.Y.Z.dev', + ... 
+ ] + +or in command line:: + + pip install 'mlblocks>=X.Y.Z.dev' diff --git a/HISTORY.md b/HISTORY.md index d08624dc..97c363f3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,79 @@ Changelog ========= +0.6.2 - 2024-11-18 +------------------ + +* Upgrade python version to include 3.12 and 3.13 - [Issue #144](https://github.com/MLBazaar/MLBlocks/issues/144) by @sarahmish + +0.6.1 - 2023-09-26 +------------------ + +* Add python 3.11 to MLBlocks - [Issue #143](https://github.com/MLBazaar/MLBlocks/issues/143) by @sarahmish + +0.6.0 - 2023-04-14 +------------------ + +* Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish + +0.5.0 - 2023-01-22 +------------------ + +* Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish + +0.4.1 - 2021-10-08 +------------------ + +* Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish +* Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer + +0.4.0 - 2021-01-09 +------------------ + +* Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish +* Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer +* Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala +* Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish +* Allow loading a primitive or pipeline directly from the JSON path - [Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala +* Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu +* Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu + +0.3.4 - 2019-11-01 +------------------ + +* Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala +* Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala + +0.3.3 - 2019-09-09 +------------------ + +* Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala + +0.3.2 - 2019-08-12 +------------------ + +* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala +* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala +* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala +* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala +* Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala + +0.3.1 - Pipelines Discovery +--------------------------- + +* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala +* Implement partial re-fit -[Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock - [Issue 
#86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala + +0.3.0 - New Primitives Discovery +-------------------------------- + +* New primitives discovery system based on `entry_points`. +* Conditional Hyperparameters filtering in MLBlock initialization. +* Improved logging and exception reporting. + 0.2.4 - New Datasets and Unit Tests ----------------------------------- diff --git a/Makefile b/Makefile index dc62e90d..4fa8cc04 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,14 @@ clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all install: clean-build clean-pyc ## install the package to the active Python's site-packages pip install . +.PHONY: install-examples +install-examples: clean-build clean-pyc ## install the package and the examples dependencies + pip install .[examples] + +.PHONY: install-unit +install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests + pip install .[unit] + .PHONY: install-test install-test: clean-build clean-pyc ## install the package and test dependencies pip install .[test] @@ -80,6 +88,12 @@ install-test: clean-build clean-pyc ## install the package and test dependencies install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development pip install -e .[dev] +MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') + +.PHONY: install-minimum +install-minimum: ## install the minimum supported versions of the package dependencies + pip install $(MINIMUM) + # LINT TARGETS @@ -98,16 +112,46 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort autopep8 --in-place --recursive --aggressive tests isort --apply --atomic --recursive tests +.PHONY: lint-docs +lint-docs: ## check docs formatting with doc8 and pydocstyle + doc8 mlblocks/ + pydocstyle mlblocks/ + # TEST TARGETS -.PHONY: test -test: ## run tests quickly with the default Python +.PHONY: test-unit +test-unit: ## run tests quickly with the default Python + python -m pytest --cov=mlblocks --ignore=tests/features/ + +.PHONY: test-mlprimitives +test-mlprimitives: ## run tests quickly with the default Python python -m pytest --cov=mlblocks +.PHONY: test-readme +test-readme: ## run the readme snippets + rm -rf tests/readme_test && mkdir tests/readme_test + cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md + rm -rf tests/readme_test + +.PHONY: test-tutorials +test-tutorials: ## run the tutorial notebooks + find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + + +.PHONY: test +test: test-unit test-mlprimitives test-readme ## test everything that needs test dependencies + +.PHONY: check-dependencies +check-dependencies: ## test if there are any broken dependencies + pip check + +.PHONY: test-devel +test-devel: check-dependencies lint docs ## test everything that needs development dependencies + .PHONY: test-all test-all: ## run tests on every Python version with tox - tox + tox -r .PHONY: coverage coverage: ## check code coverage quickly with the default Python @@ -122,14 +166,13 @@ coverage: ## check code coverage quickly with the default Python .PHONY: docs docs: clean-docs ## generate Sphinx HTML documentation, including API docs 
$(MAKE) -C docs html - touch docs/_build/html/.nojekyll .PHONY: view-docs -view-docs: docs ## view docs in browser +view-docs: ## view the docs in a browser $(BROWSER) docs/_build/html/index.html .PHONY: serve-docs -serve-docs: view-docs ## compile the docs watching for changes +serve-docs: ## compile the docs watching for changes watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs @@ -141,17 +184,24 @@ dist: clean ## builds source and wheel package python setup.py bdist_wheel ls -l dist -.PHONY: test-publish -test-publish: dist ## package and upload a release on TestPyPI +.PHONY: publish-confirm +publish-confirm: + @echo "WARNING: This will irreversibly upload a new version to PyPI!" + @echo -n "Please type 'confirm' to proceed: " \ + && read answer \ + && [ "$${answer}" = "confirm" ] + +.PHONY: publish-test +publish-test: dist publish-confirm ## package and upload a release on TestPyPI twine upload --repository-url https://test.pypi.org/legacy/ dist/* .PHONY: publish -publish: dist ## package and upload a release +publish: dist publish-confirm ## package and upload a release twine upload dist/* .PHONY: bumpversion-release bumpversion-release: ## Merge master to stable and bumpversion release - git checkout stable + git checkout stable || git checkout -b stable git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" bumpversion release git push --tags origin stable @@ -163,6 +213,10 @@ bumpversion-patch: ## Merge stable to master and bumpversion patch bumpversion --no-tag patch git push +.PHONY: bumpversion-candidate +bumpversion-candidate: ## Bump the version to the next candidate + bumpversion candidate --no-tag + .PHONY: bumpversion-minor bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion --no-tag minor @@ -171,23 +225,49 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major -CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l) +.PHONY: bumpversion-revert +bumpversion-revert: ## Undo a previous bumpversion-release + git checkout master + git branch -D stable + +CLEAN_DIR := $(shell git status --short | grep -v ??) 
+CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) -.PHONY: check-release -check-release: ## Check if the release can be made +.PHONY: check-clean +check-clean: ## Check if the directory has uncommitted changes +ifneq ($(CLEAN_DIR),) + $(error There are uncommitted changes) +endif + +.PHONY: check-master +check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) $(error Please make the release from master branch\n) endif + +.PHONY: check-history +check-history: ## Check if HISTORY.md has been modified ifeq ($(CHANGELOG_LINES),0) $(error Please insert the release notes in HISTORY.md before releasing) -else - @echo "A new release can be made" endif +.PHONY: check-release +check-release: check-clean check-master check-history ## Check if the release can be made + @echo "A new release can be made" + .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-test +release-test: check-release bumpversion-release-test publish-test bumpversion-revert + +.PHONY: release-candidate +release-candidate: check-master publish bumpversion-candidate + +.PHONY: release-candidate-test +release-candidate-test: check-clean check-master publish-test + .PHONY: release-minor release-minor: check-release bumpversion-minor release diff --git a/README.md b/README.md index fb8d3885..fb5ba341 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,35 @@ -

[README header markup: the old logo line, an image with alt text “MLBlocks”, is removed and replaced by the DAI-Lab banner, an image with alt text “DAI-Lab” and the caption “An Open Source Project from the Data to AI Lab, at MIT”, followed by a new “MLBlocks” logo image; the surrounding `<p>`/`<img>` HTML tags are not recoverable here.]

Pipelines and Primitives for Machine Learning and Data Science. -

-[![PyPi][pypi-img]][pypi-url] -[![Travis][travis-img]][travis-url] -[![CodeCov][codecov-img]][codecov-url] +[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) +[![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) +[![Tests](https://github.com/MLBazaar/MLBlocks/workflows/Run%20Tests/badge.svg)](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) +[![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) +[![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials) + +
-[pypi-img]: https://img.shields.io/pypi/v/mlblocks.svg -[pypi-url]: https://pypi.python.org/pypi/mlblocks -[travis-img]: https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master -[travis-url]: https://travis-ci.org/HDI-Project/MLBlocks -[codecov-img]: https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg -[codecov-url]: https://codecov.io/gh/HDI-Project/MLBlocks +# MLBlocks -* Free software: MIT license -* Documentation: https://HDI-Project.github.io/MLBlocks +* Documentation: https://mlbazaar.github.io/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) +* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) -# Overview +## Overview MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. @@ -34,142 +41,112 @@ Features include: no python code to write, carefully curated by Machine Learning and Domain experts. * Extract machine-readable information about which hyperparameters can be tuned and within which ranges, allowing automated integration with Hyperparameter Optimization tools like - [BTB](https://github.com/HDI-Project/BTB). + [BTB](https://github.com/MLBazaar/BTB). * Complex multi-branch pipelines and DAG configurations, with unlimited number of inputs and outputs per primitive. * Easy save and load Pipelines using JSON Annotations. -# Installation +# Install + +## Requirements + +**MLBlocks** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12, 3.13](https://www.python.org/downloads/) + +## Install with `pip` -The simplest and recommended way to install MLBlocks is using `pip`: +The easiest and recommended way to install **MLBlocks** is using [pip]( +https://pip.pypa.io/en/stable/): ```bash pip install mlblocks ``` -Alternatively, you can also clone the repository and install it from sources +This will pull and install the latest stable release from [PyPi](https://pypi.org/). + +If you want to install from source or contribute to the project please read the +[Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started). + +## MLPrimitives + +In order to be usable, MLBlocks requires a compatible primitives library. + +The official library, required in order to follow the following MLBlocks tutorial, +is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install +with this command: ```bash -git clone git@github.com:HDI-Project/MLBlocks.git -cd MLBlocks -make install +pip install mlprimitives ``` -For development, you can use `make install-develop` instead in order to install all -the required dependencies for testing and code linting. +# Quickstart -# Usage Example +Below there is a short example about how to use **MLBlocks** to solve the [Adult Census +Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a +pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), +[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). -Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it -using demo data and use it to make predictions. 
+```python3 +import pandas as pd +from mlblocks import MLPipeline +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score -For advance usage and more detailed explanation about each component, please have a look -at the [documentation](https://HDI-Project.github.io/MLBlocks) +dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') +label = dataset.pop('label') -## Creating a pipeline +X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) -With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing -them to the `MLPipeline` class. +primitives = [ + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' +] +pipeline = MLPipeline(primitives) -```python ->>> from mlblocks import MLPipeline -... primitives = [ -... 'cv2.GaussianBlur', -... 'skimage.feature.hog', -... 'sklearn.ensemble.RandomForestClassifier' -... ] ->>> pipeline = MLPipeline(primitives) -``` +pipeline.fit(X_train, y_train) +predictions = pipeline.predict(X_test) -Optionally, specific hyperparameters can be also set by specifying them in a dictionary: - -```python ->>> hyperparameters = { -... 'skimage.feature.hog': { -... 'multichannel': True, -... 'visualize': False -... }, -... 'sklearn.ensemble.RandomForestClassifier': { -... 'n_estimators': 100, -... } -... } ->>> pipeline = MLPipeline(primitives, hyperparameters) +accuracy_score(y_test, predictions) ``` -If you can see which hyperparameters a particular pipeline is using, you can do so by calling -its `get_hyperparameters` method: - -```python ->>> import json ->>> hyperparameters = pipeline.get_hyperparameters() ->>> print(json.dumps(hyperparameters, indent=4)) -{ - "cv2.GaussianBlur#1": { - "ksize_width": 3, - "ksize_height": 3, - "sigma_x": 0, - "sigma_y": 0 - }, - "skimage.feature.hog#1": { - "multichannel": true, - "visualize": false, - "orientations": 9, - "pixels_per_cell_x": 8, - "pixels_per_cell_y": 8, - "cells_per_block_x": 3, - "cells_per_block_y": 3, - "block_norm": null - }, - "sklearn.ensemble.RandomForestClassifier#1": { - "n_jobs": -1, - "n_estimators": 100, - "criterion": "entropy", - "max_features": null, - "max_depth": 10, - "min_samples_split": 0.1, - "min_samples_leaf": 0.1, - "class_weight": null - } -} -``` +# What's Next? -### Making predictions +If you want to learn more about how to tune the pipeline hyperparameters, save and load +the pipelines using JSON annotations or build complex multi-branched pipelines, please +check our [documentation site](https://mlbazaar.github.io/MLBlocks). -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. +Also do not forget to have a look at the [notebook tutorials]( +https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)! -To do this, we first call the `fit` method passing the training data and the corresponding labels. +# Citing MLBlocks -In this case in particular, we will be loading the handwritten digit classification dataset -from USPS using the `mlblocks.datasets.load_usps` method, which returns a dataset object -ready to be played with. +If you use MLBlocks for your research, please consider citing our related papers. 
-```python ->>> from mlblocks.datasets import load_usps ->>> dataset = load_usps() ->>> X_train, X_test, y_train, y_test = dataset.get_splits(1) ->>> pipeline.fit(X_train, y_train) -``` +For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at +the MIT Data To AI Lab, please see: -Once we have fitted our model to our data, we can call the `predict` method passing new data -to obtain predictions from the pipeline. +Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar: +Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv +Preprint 1905.08942. 2019. -```python ->>> predictions = pipeline.predict(X_test) ->>> predictions -array([3, 2, 1, ..., 1, 1, 2]) +```bibtex +@article{smith2019mlbazaar, + author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan}, + title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development}, + journal = {arXiv e-prints}, + year = {2019}, + eid = {arXiv:1905.08942}, + pages = {arXiv:1905.08942}, + archivePrefix = {arXiv}, + eprint = {1905.08942}, +} ``` -## What's Next? - -If you want to learn more about how to tune the pipeline hyperparameters, save and load -the pipelines using JSON annotations or build complex multi-branched pipelines, please -check our [documentation](https://HDI-Project.github.io/MLBlocks). - -# History +For the first MLBlocks version from 2015, designed for only multi table, multi entity temporal data, please +refer to Bryan Collazo’s thesis: -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: * [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf). Bryan Collazo. Masters thesis, MIT EECS, 2015. diff --git a/apt.txt b/apt.txt new file mode 100644 index 00000000..65387721 --- /dev/null +++ b/apt.txt @@ -0,0 +1,3 @@ +# apt-get requirements for development and mybinder environment +graphviz +pandoc diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index fc2e81b9..5ad0b60b 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -17,8 +17,8 @@ This can be achieved by running the commands:: For further details, please refer to the `MLPrimitives Documentation`_. -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _MLPrimitives Documentation: https://hdi-project.github.io/MLPrimitives/ +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/ Writing Primitives ------------------ @@ -27,9 +27,9 @@ Sometimes you will find that you want to use a primitive that is not in the list `MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself by writing the corresponding `JSON annotation `_. -.. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives +.. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives -.. note:: If you integrate new primitives for MLBlocks, please consider contributing them to the +.. note:: If you create new primitives for MLBlocks, please consider contributing them to the **MLPrimitives** project! 
The first thing to do when adding a new primitive is making sure that it complies with the @@ -58,8 +58,8 @@ place known to **MLBlocks**. **MLBlocks** looks for primitives in the following folders, in this order: 1. Any folder specified by the user, starting by the latest one. -2. A folder named `mlblocks_primitives` in the current working directory. -3. A folder named `mlblocks_primitives` in the `system prefix`_. +2. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory. +3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_. .. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix @@ -80,3 +80,38 @@ However, sometimes you will want to add a custom directory. This can be easily done by using the `mlblocks.add_primitives_path`_ method. .. _mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path + +Developing a Primitives Library +------------------------------- + +Another option to add multiple libraries is creating a primitives library, such as +`MLPrimitives`_. + +In order to make **MLBLocks** able to find the primitives defined in such a library, +all you need to do is setting up an `Entry Point`_ in your `setup.py` script with the +following specification: + +1. It has to be published under the group ``mlblocks``. +2. It has to be named exactly ``primitives``. +3. It has to point at a variable that contains a path or a list of paths to the JSONS folder(s). + +An example of such an entry point would be:: + + entry_points = { + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' + ] + } + +where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = 'path/to/primitives' + +or:: + + SOME_VARIABLE = [ + 'path/to/primitives', + 'path/to/more/primitives' + ] + +.. _Entry Point: https://packaging.python.org/specifications/entry-points/ diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst index bc31d4fd..488be9a9 100644 --- a/docs/advanced_usage/hyperparameters.rst +++ b/docs/advanced_usage/hyperparameters.rst @@ -165,6 +165,19 @@ Conditional Hyperparameters In some other cases, the values that a hyperparameter can take depend on the value of another one. +For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending +on the kernel used some other hyperparameters may be or not be used, or they might be able +to take only some specific values. + +In this case, the ``type`` of the hyperparameter whose values depend on the other is specified +as ``conditional``. +In this case, two additional entries are required: + +* an entry called ``condition``, which specifies the name of the other hyperparameter, the value + of which is evaluated to decide which values this hyperparameter can take. +* an additional subdictionary called ``values``, which relates the possible values that the + `condition` hyperparameter can have with the full specifications of the type and values that + this hyperparameter can take in each case. 
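Conceptually, the active specification of a conditional hyperparameter is obtained by looking up
the current value of the ``condition`` hyperparameter inside the ``values`` dictionary, falling
back to the ``default`` entry when there is no match. This is not the actual **MLBlocks**
implementation, but a small Python sketch of that lookup may help make the semantics explicit::

    def resolve_conditional(spec, hyperparameter_values):
        """Return the active specification of a ``conditional`` hyperparameter.

        ``spec`` is the annotated conditional entry, and ``hyperparameter_values``
        holds the values currently assigned to the other hyperparameters of the block.
        """
        condition_value = hyperparameter_values.get(spec['condition'])

        # If the condition value has no matching entry, the ``default`` entry is used
        # instead; a ``None`` default means that the hyperparameter is disabled.
        return spec['values'].get(condition_value, spec.get('default'))

In the annotation example below, ``spec`` would correspond to the ``max_features_aggregation``
entry and the ``condition`` hyperparameter would be ``max_features``.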
Suppose, for example, that the primitive explained in the previous point does not expect the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter, @@ -190,7 +203,7 @@ In this case, the hyperparameters would be annotated like this:: } "max_features_aggregation": { "type": "conditional", - "condition": "mas_features", + "condition": "max_features", "default": null, "values": { "auto": { @@ -202,10 +215,14 @@ In this case, the hyperparameters would be annotated like this:: } } +.. note:: Just like a regular hyperparameter, if there is no match the default entry is used. + In this example, the ``null`` value indicates that the hyperparameter needs to be + disabled if there is no match, but instead of it we could add there a full specification + of type, range and default value as a nested dictionary to be used by default. .. _JSON Annotations: primitives.html#json-annotations -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _BTB: https://github.com/HDI-Project/BTB +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _BTB: https://github.com/MLBazaar/BTB .. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline .. _multitype: #multitype-hyperparameters .. _conditional: #conditional-hyperparameters diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst index cc7ccc49..07b36c98 100644 --- a/docs/advanced_usage/pipelines.rst +++ b/docs/advanced_usage/pipelines.rst @@ -86,7 +86,7 @@ This can be done by passing an extra dictionary to the MLPipeline when it is cre 'n_estimators': 100 } } - pipeline = MLPipeline(primitives, init_params) + pipeline = MLPipeline(primitives, init_params=init_params) This dictionary must have as keys the name of the blocks that the arguments belong to, and as values the dictionary that contains the argument names and their values. @@ -271,7 +271,7 @@ Like primitives, Pipelines can also be annotated and stored as dicts or JSON fil the different arguments expected by the ``MLPipeline`` class, as well as the set hyperparameters and tunable hyperparameters. -Representing a Pipeline as a dict +Representing a Pipeline as a dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The dict representation of an Pipeline can be obtained directly from an ``MLPipeline`` instance, @@ -344,6 +344,122 @@ that allows loading the pipeline directly from a JSON file: pipeline = MLPipeline.load('pipeline.json') + +Intermediate Outputs and Partial Execution +------------------------------------------ + +Sometimes we might be interested in capturing an intermediate output within a +pipeline execution in order to inspect it, for debugging purposes, or to reuse +it later on in order to speed up a tuning process where the pipeline needs +to be executed multiple times over the same data. + +For this, two special arguments have been included in the ``fit`` and ``predict`` +methods of an MLPipeline: + +output\_ +~~~~~~~~ + +The ``output_`` argument indicates which block within the pipeline we are interested +in taking the output values from. This, implicitly, indicates up to which block the +pipeline needs to be executed within ``fit`` and ``predict`` before returning. + +The ``output_`` argument is optional, and it can either be ``None``, which is the default, +and Integer or a String. + +And its format is as follows: + +* If it is ``None`` (default), the ``fit`` method will return nothing and the + ``predict`` method will return the output of the last block in the pipeline. 
+* If an integer is given, it is interpreted as the block index, starting on 0, + and the whole context after executing the specified block will be returned. + In case of ``fit``, this means that the outputs will be returned after fitting + a block and then producing it on the same data. +* If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot character, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + +start\_ +~~~~~~~ + +The ``start_`` argument indicates which block within the pipeline we are interested +in starting the computation from when executing ``fit`` and ``predict``, allowing us +to skip some of the initial blocks. + +The ``start_`` argument is optional, and it can either be ``None``, which is the default, +and Integer or a String. + +And its format is as follows: + +* If it is ``None``, the execution will start on the first block. +* If it is an integer, it is interpreted as the block index +* If it is a string, it is expected to be the name of the block, including the counter + number at the end. + +This is specially useful when used in combination with the ``output_`` argument, as it +effectively allows us to both capture intermediate outputs for debugging purposes or +reusing intermediate states of the pipeline to accelerate tuning processes. + +An example of this situation, where we want to reuse the output of the first block, could be:: + + context_0 = pipeline.fit(X_train, y_train, output_=0) + + # Afterwards, within the tuning loop + pipeline.fit(start_=1, **context_0) + predictions = pipeline.predict(X_test) + score = compute_score(y_test, predictions) + +Pipeline debugging +------------------ + +Sometimes we might be interested in debugging a pipeline execution and obtain information +about the time, the memory usage, the inputs and outputs that each step takes. This is possible +by using the argument ``debug`` with the method ``fit`` and ``predict``. This argument allows us +to retrieve critical information from the pipeline execution: + +* ``Time``: Elapsed time for the primitive and the given stage (fit or predict). +* ``Memory``: Amount of memory increase or decrease for the given primitive for that pipeline. +* ``Input``: The input values that the primitive takes for that specific step. +* ``Output``: The output produced by the primitive. 
+ + +If the ``debug`` argument is set to ``True`` then a dictionary will be returned containing all the +elements listed previously:: + + result, debug_info = pipeline.fit(X_train, y_train, debug=True) + +In case you want to retrieve only some of the elements listed above and skip the rest, you can +pass an ``str`` to the ``debug`` argument with any combination of the following characters: + +* ``i``: To include inputs. +* ``o``: To include outputs. +* ``m``: To include used memory. +* ``t``: To include elapsed time. + +For example, if we are only interested on capturing the elapsed time and used memory during the +``fit`` process, we can call the method as follows:: + + result, debug_info = pipeline.fit(X_train, y_train, debug='tm') + +.. warning:: Bear in mind that if we use ``debug=True`` or saving the ``Input`` and ``Output``, + this will consume extra memory ram as it will create copies of the input data and + the output data for each primitive. For profiling it is recommended using the option + ``tm`` as shown in the previous example. + .. _API Reference: ../api_reference.html .. _primitives: ../primitives.html .. _mlblocks.MLPipeline: ../api_reference.html#mlblocks.MLPipeline diff --git a/docs/advanced_usage/primitives.rst b/docs/advanced_usage/primitives.rst index 58847bbe..37df9031 100644 --- a/docs/advanced_usage/primitives.rst +++ b/docs/advanced_usage/primitives.rst @@ -311,11 +311,11 @@ For a more detailed description of this class, please check the corresponding section in the `API Reference`_ documentation. .. _API Reference: ../api_reference.html -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _keras.preprocessing.text.Tokenizer: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json .. _hyperparameters: hyperparameters.html .. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock .. _pipelines: pipelines.html -.. _examples folder: https://github.com/HDI-Project/MLBlocks/tree/master/examples +.. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples .. _fit: ../api_reference.html#mlblocks.MLBlock.fit .. _produce: ../api_reference.html#mlblocks.MLBlock.produce diff --git a/docs/api/mlblocks.datasets.rst b/docs/api/mlblocks.datasets.rst deleted file mode 100644 index 6661cd8a..00000000 --- a/docs/api/mlblocks.datasets.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.datasets -================= - -.. automodule:: mlblocks.datasets - :members: diff --git a/docs/changelog.rst b/docs/changelog.rst index fcd2eb2d..d26e5be8 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1 +1 @@ -.. include:: ../HISTORY.md +.. mdinclude:: ../HISTORY.md diff --git a/docs/conf.py b/docs/conf.py index 8659996f..f81b7b7e 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,37 +18,34 @@ # relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. 
-import os -import sys - import sphinx_rtd_theme # For read the docs theme -from recommonmark.parser import CommonMarkParser -# from recommonmark.transform import AutoStructify - -# sys.path.insert(0, os.path.abspath('..')) import mlblocks -# -# mlblocks.add_primitives_path('../mlblocks_primitives') # -- General configuration --------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' - # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.napoleon', + 'm2r', + 'sphinx.ext.autodoc', 'sphinx.ext.githubpages', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', 'sphinx.ext.graphviz', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', + 'autodocsumm', ] -ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] +autodoc_default_options = { + 'autosummary': True, +} +ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -56,10 +53,6 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md', '.ipynb'] -source_parsers = { - '.md': CommonMarkParser, -} - # The master toctree document. master_doc = 'index' @@ -70,7 +63,7 @@ copyright = '2018, MIT Data To AI Lab' author = 'MIT Data To AI Lab' description = 'Pipelines and Primitives for Machine Learning and Data Science.' -user = 'HDI-Project' +user = 'MLBazaar' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -121,7 +114,7 @@ # documentation. html_theme_options = { 'collapse_navigation': False, - 'display_version': False, + 'display_version': True, } # Add any paths that contain custom static files (such as style sheets) here, diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst index 4163f3bd..d64970a2 100644 --- a/docs/getting_started/install.rst +++ b/docs/getting_started/install.rst @@ -18,40 +18,26 @@ you through the process. .. _pip: https://pip.pypa.io .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ -From sources ------------- +Additional dependencies +----------------------- -The sources for MLBlocks can be downloaded from the `Github repo`_. +In order to be usable, MLBlocks requires a compatible primitives library. -You can either clone the public repository: +The official library, required in order to follow the MLBlocks tutorials and documentation examples, +is `MLPrimitives`_, which you can install with this command: .. code-block:: console - git clone git://github.com/HDI-Project/MLBlocks + pip install mlprimitives -Or download the `tarball`_: +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives -.. code-block:: console - - curl -OL https://github.com/HDI-Project/MLBlocks/tarball/master - -Once you have a copy of the source, you can install it running the next command inside the -project folder: - -.. code-block:: console - - $ make install - -.. _Github repo: https://github.com/HDI-Project/MLBlocks -.. 
_tarball: https://github.com/HDI-Project/MLBlocks/tarball/master - -Development ------------ +Install for development +----------------------- If you are installing **MLBlocks** in order to modify its code, the installation must be done from its sources, in the editable mode, and also including some additional dependencies in -order to be able to run the tests and build the documentation: - -.. code-block:: console +order to be able to run the tests and build the documentation. Instructions about this process +can be found in the `Contributing guide`_. - make install-develop +.. _Contributing guide: ../contributing.html#get-started diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2e00ece6..55c20d86 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,21 +24,25 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ - 'mlprimitives.feature_extraction.StringVectorizer', - 'sklearn.ensemble.RandomForestClassifier', + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' ] pipeline = MLPipeline(primitives) -Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary: +Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and +passing them as the ``init_params`` argument: .. ipython:: python - hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier': { - 'n_estimators': 100 + init_params = { + 'sklearn.impute.SimpleImputer': { + 'strategy': 'median' } } - pipeline = MLPipeline(primitives, hyperparameters) + pipeline = MLPipeline(primitives, init_params=init_params) Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set for each block, by calling the `get_hyperparameters method`_. @@ -80,13 +84,13 @@ other ones will remain unmodified. .. ipython:: python new_hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier#1': { + 'xgboost.XGBClassifier#1': { 'max_depth': 15 } } pipeline.set_hyperparameters(new_hyperparameters) hyperparameters = pipeline.get_hyperparameters() - hyperparameters['sklearn.ensemble.RandomForestClassifier#1']['max_depth'] + hyperparameters['xgboost.XGBClassifier#1']['max_depth'] Making predictions ------------------ @@ -98,20 +102,28 @@ To do this, we first call the ``fit`` method passing the training data and the c labels. .. ipython:: python + :okwarning: - from mlblocks.datasets import load_personae - dataset = load_personae() - X_train, X_test, y_train, y_test = dataset.get_splits(1) + import pandas as pd + from sklearn.model_selection import train_test_split + + dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') + label = dataset.pop('label') + + X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) pipeline.fit(X_train, y_train) Once we have fitted our model to our data, we can call the ``predict`` method passing new data to obtain predictions from the pipeline. .. ipython:: python + :okwarning: + + from sklearn.metrics import accuracy_score predictions = pipeline.predict(X_test) predictions - dataset.score(y_test, predictions) + accuracy_score(y_test, predictions) .. _you have already installed them: install.html#additional-dependencies .. 
_MLPipeline class: ../api_reference.html#mlblocks.MLPipeline @@ -119,5 +131,5 @@ to obtain predictions from the pipeline. .. _hyperparameters: ../advanced_usage/hyperparameters.html .. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations .. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters -.. _BTB: https://github.com/HDI-Project/BTB +.. _BTB: https://github.com/MLBazaar/BTB .. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters diff --git a/docs/index.rst b/docs/index.rst index 28a3f0bb..25567005 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,10 @@ What is MLBlocks? :alt: MLBlocks :align: center +* Documentation: https://mlbazaar.github.io/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: `MIT `_ + MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party libraries, and build Pipelines out of them that can be fitted and then used to make predictions. @@ -31,15 +35,28 @@ At a high level: History ------- -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: +In its first iteration, in 2015, MLBlocks was designed for only multi table, multi entity temporal +data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis, written +under the supervision of Kalyan Veeramachaneni: * `Machine learning blocks`_. Bryan Collazo. Masters thesis, MIT EECS, 2015. -With recent availability of a multitude of libraries and tools, we decided it was time to integrate -them and expand the library to address other data types: images, text, graph, time series and -integrate with deep learning libraries. +In 2018, with recent availability of a multitude of libraries and tools, we decided it was time to +integrate them and expand the library to address other data types, like images, text, graph or +time series, as well as introduce the usage of deep learning libraries. A second iteration of our +work was then started by the hand of William Xue: + +* `A Flexible Framework for Composing End to End Machine Learning Pipelines`_. + William Xue. Masters thesis, MIT EECS, 2018. + +Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library +that would become part of a bigger software ecosystem designed to facilitate the development of +robust end-to-end solutions based on Machine Learning tools. This third iteration of our work +was presented in 2019 as part of the Machine Learning Bazaar: + +* `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_. + Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. Sigmod 2020. .. toctree:: :caption: Getting Started @@ -74,6 +91,7 @@ integrate with deep learning libraries. api/mlblocks api/mlblocks.datasets + api/mlblocks.discovery .. toctree:: :caption: Resources @@ -89,5 +107,7 @@ Indices and tables * :ref:`modindex` * :ref:`search` -.. _Machine learning blocks: https://github.com/HDI-Project/mlblocks -.. _tarball: https://github.com/HDI-Project/mlblocks/tarball/master +.. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf + +.. 
_A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf +.. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942 diff --git a/docs/pipeline.json b/docs/pipeline.json deleted file mode 100644 index c09d763c..00000000 --- a/docs/pipeline.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "primitives": [ - "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.RandomForestClassifier" - ], - "init_params": { - "sklearn.preprocessing.StandardScaler": { - "with_mean": false - }, - "sklearn.ensemble.RandomForestClassifier": { - "n_estimators": 100 - } - }, - "input_names": {}, - "output_names": {}, - "hyperparameters": { - "sklearn.preprocessing.StandardScaler#1": { - "with_mean": false, - "with_std": true - }, - "sklearn.ensemble.RandomForestClassifier#1": { - "n_jobs": -1, - "n_estimators": 100, - "criterion": "entropy", - "max_features": null, - "max_depth": 10, - "min_samples_split": 0.1, - "min_samples_leaf": 0.1, - "class_weight": null - } - }, - "tunable_hyperparameters": { - "sklearn.preprocessing.StandardScaler#1": { - "with_std": { - "type": "bool", - "default": true - } - }, - "sklearn.ensemble.RandomForestClassifier#1": { - "criterion": { - "type": "str", - "default": "entropy", - "values": [ - "entropy", - "gini" - ] - }, - "max_features": { - "type": "str", - "default": null, - "range": [ - null, - "auto", - "log2" - ] - }, - "max_depth": { - "type": "int", - "default": 10, - "range": [ - 1, - 30 - ] - }, - "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [ - 0.0001, - 0.5 - ] - }, - "min_samples_leaf": { - "type": "float", - "default": 0.1, - "range": [ - 0.0001, - 0.5 - ] - }, - "class_weight": { - "type": "str", - "default": null, - "range": [ - null, - "balanced" - ] - } - } - } -} \ No newline at end of file diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 5503e739..082d12b6 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -30,7 +30,7 @@ additional information not found inside `X`. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_umls + from mlprimitives.datasets import load_umls dataset = load_umls() dataset.describe() @@ -39,7 +39,7 @@ additional information not found inside `X`. primitives = [ 'networkx.link_prediction_feature_extraction', - 'mlprimitives.feature_extraction.CategoricalEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'sklearn.preprocessing.StandardScaler', 'xgboost.XGBClassifier' ] @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. 
_XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst index e8274761..e892f915 100644 --- a/docs/pipeline_examples/image.rst +++ b/docs/pipeline_examples/image.rst @@ -24,7 +24,7 @@ Gradients using the corresponding `scikit-image function`_ to later on use a sim .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -61,7 +61,7 @@ and directly after go into a Single Layer CNN Classifier built on Keras using th .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -107,7 +107,7 @@ to an `XGBRegressor`_ primitive. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_handgeometry + from mlprimitives.datasets import load_handgeometry dataset = load_handgeometry() dataset.describe() @@ -136,7 +136,7 @@ to an `XGBRegressor`_ primitive. .. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/ .. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur -.. _MLPrimitives primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json +.. _MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json .. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html .. _Pretrained Networks from Keras: https://keras.io/applications/ diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst index 109f4015..7091a374 100644 --- a/docs/pipeline_examples/multi_table.rst +++ b/docs/pipeline_examples/multi_table.rst @@ -25,7 +25,7 @@ tables are. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_wikiqa + from mlprimitives.datasets import load_wikiqa dataset = load_wikiqa() dataset.describe() @@ -49,5 +49,5 @@ tables are. .. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/ .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn -.. _DeepFeatureSynthesis: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json +.. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json .. _featuretools: https://www.featuretools.com/ diff --git a/docs/pipeline_examples/single_table.rst b/docs/pipeline_examples/single_table.rst index ee00d9c6..6a031cb1 100644 --- a/docs/pipeline_examples/single_table.rst +++ b/docs/pipeline_examples/single_table.rst @@ -5,7 +5,7 @@ In this section we will go over a few pipeline examples to show **MLBlocks** wor in different scenarios and with different types of data. For each example, we will be using example datasets which can be downloaded using the -various functions found in the ``mlblocks.datasets`` module. 
+various functions found in the ``mlprimitives.datasets`` module. .. note:: Even though the datasets are not especially big, some of the examples might use a considerable amount of resources, especially memory, and might take @@ -21,7 +21,7 @@ the numeric data from `The Boston Dataset`_, which we will load using the .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_boston + from mlprimitives.datasets import load_boston dataset = load_boston() dataset.describe() @@ -52,7 +52,7 @@ In this case, we will also be passing some initialization parameters for the XGB .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_iris + from mlprimitives.datasets import load_iris dataset = load_iris() dataset.describe() diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index df8a9d5a..75ca3f39 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -28,7 +28,7 @@ for later ones. import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_newsgroups + from mlprimitives.datasets import load_newsgroups dataset = load_newsgroups() dataset.describe() @@ -40,31 +40,31 @@ for later ones. # set up the pipeline primitives = [ - "mlprimitives.counters.UniqueCounter", - "mlprimitives.text.TextCleaner", - "mlprimitives.counters.VocabularyCounter", + "mlprimitives.custom.counters.UniqueCounter", + "mlprimitives.custom.text.TextCleaner", + "mlprimitives.custom.counters.VocabularyCounter", "keras.preprocessing.text.Tokenizer", "keras.preprocessing.sequence.pad_sequences", "keras.Sequential.LSTMTextClassifier" ] input_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "X": "y" } } output_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "counts": "classes" }, - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "counts": "vocabulary_size" } } init_params = { - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "add": 1 }, - "mlprimitives.text.TextCleaner#1": { + "mlprimitives.custom.text.TextCleaner#1": { "language": "en" }, "keras.preprocessing.sequence.pad_sequences#1": { @@ -105,7 +105,7 @@ to encode all the string features, and go directly into the import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_personae + from mlprimitives.datasets import load_personae dataset = load_personae() dataset.describe() @@ -116,12 +116,12 @@ to encode all the string features, and go directly into the nltk.download('stopwords') primitives = [ - 'mlprimitives.text.TextCleaner', - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.text.TextCleaner', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] init_params = { - 'mlprimitives.text.TextCleaner': { + 'mlprimitives.custom.text.TextCleaner': { 'column': 'text', 'language': 'nl' }, @@ -140,9 +140,9 @@ to encode all the string features, and go directly into the .. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html -.. _TextCleaner primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/text.py -.. _StringVectorizer primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/feature_extraction.py +.. 
_TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py +.. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py .. _keras text preprocessing: https://keras.io/preprocessing/text/ -.. _Keras LSTM Classifier from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json +.. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json .. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000..de298ef2 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,57 @@ +# MLBlocks Examples + +This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks +functionality. + +Within this folder you will find: + + +* `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities. +* `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities. +* `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities. + + +# Requirements + +In order to run the examples contained in this folder you should have [pip installed on your system +](https://pip.pypa.io/en/stable/installing/). + +Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to +run them in an isolated environment. + +# Usage + +In order to run these tutorials on your computer, please follow these steps: + +1. Clone this GitHub repository: + +```bash +git clone git@github.com:MLBazaar/MLBlocks.git +``` + +2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the +rest of your computer: + +```bash +pip install virtualenv +virtualenv -p $(which python3.6) mlblocks-venv +source mlblocks-venv/bin/activate +``` + +3. Enter the repository and install the dependencies: + +```bash +cd MLBlocks +make install-examples +``` + +This will install [MLBlocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives]( +https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/). + +4. Enter the `examples` folder and start a Jupyter Notebook: + +```bash +jupyter notebook +``` + +5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder.
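+
+As a quick check that everything was installed correctly, you can also try loading one of the
+example pipeline JSONs directly from Python. This is a minimal sketch, assuming it is run from
+inside the `examples` folder (so that the relative path resolves) and that the `MLPipeline.load`
+method is available in your MLBlocks version:
+
+```python
+from mlblocks import MLPipeline
+
+# Load one of the example pipeline JSONs from the `pipelines` folder
+pipeline = MLPipeline.load(
+    'pipelines/single_table.classification.categorical_encoder.xgboost.json'
+)
+
+# List the primitives that make up the pipeline
+print(pipeline.primitives)
+```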
diff --git a/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json new file mode 100644 index 00000000..4dca4002 --- /dev/null +++ b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json @@ -0,0 +1,16 @@ +{ + "metadata": { + "data_modality": "single_table", + "task_type": "classification" + }, + "validation": { + "dataset": "census" + }, + "primitives": [ + "mlprimitives.custom.preprocessing.ClassEncoder", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "sklearn.impute.SimpleImputer", + "xgboost.XGBClassifier", + "mlprimitives.custom.preprocessing.ClassDecoder" + ] +} diff --git a/examples/primitives/mlblocks.examples.ClassPrimitive.json b/examples/primitives/mlblocks.examples.ClassPrimitive.json new file mode 100644 index 00000000..6c29e51e --- /dev/null +++ b/examples/primitives/mlblocks.examples.ClassPrimitive.json @@ -0,0 +1,104 @@ +{ + "name": "the_primitive_name", + "primitive": "full.python.path.to.AClass", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "keyword": "optional_name_of_the_fit_method_argument", + "description": "each input can be described", + "type": "pandas.DataFrame" + }, + { + "name": "y", + "description": "each input can be described", + "default": "default_value_for_this_argument", + "type": "pandas.Series" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "keyword": "optional_name_of_the_produce_method_argument", + "description": "each input can be described", + "type": "DataFrame" + } + ], + "output": [ + { + "name": "y", + "description": "each output argument can be described", + "type": "Series" + } + ] + }, + "hyperparameters": { + "fixed": { + "a_required_hyperparameter": { + "description": "this is a non tunable hyperparameter that needs to be specified by the user because it does not have a default value", + "type": "int" + }, + "an_optional_hyperparameter": { + "description": "this is a non tunable hyperparameter that is optional because it has a default value", + "type": "int", + "default": 1 + } + }, + "tunable": { + "a_simple_range_hyperparameter": { + "description": "hyperparameter documentation can be put here", + "default": 1, + "type": "int", + "range": [1, 10] + }, + "a_categorical_hyperparameter_of_type_int": { + "description": "Note that it has the field `values` instead of `range`", + "default": 1, + "type": "int", + "values": [1, 3, 7, 10] + }, + "a_categorical_hyperparameter_of_type_str": { + "default": "a", + "type": "str", + "values": ["a", "b", "c"] + }, + "a_multi_type_hyperparameter": { + "description": "this is a hyperparameter that allows more than one type", + "type": "multitype", + "default": "auto", + "types": { + "int": { + "description": "documentation can also be included here", + "range": [1, 10] + }, + "string": { + "values": ["some", "string", "values"] + } + } + }, + "conditional_hyperparameter": { + "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", + "type": "conditional", + "condition": "the_name_of_the_other_hyperparameter", + "values": { + "a": { + "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", + "type": "int", + "default": 0, + "range": [0, 10] + }, + "*": { + "description": "this will be used only if the value does not match any other definition", + "type": "float", + "default": 0.0, + "range": [0.0,
1.0] + } + } + } + } +} diff --git a/examples/primitives/mlblocks.examples.function_primitive.json b/examples/primitives/mlblocks.examples.function_primitive.json new file mode 100644 index 00000000..f3627bd9 --- /dev/null +++ b/examples/primitives/mlblocks.examples.function_primitive.json @@ -0,0 +1,86 @@ +{ + "name": "the_primitive_name", + "primitive": "full.python.path.to.a_function", + "produce": { + "args": [ + { + "name": "X", + "keyword": "optional_name_of_the_produce_method_argument", + "description": "each input can be described", + "type": "DataFrame" + } + ], + "output": [ + { + "description": "each output argument can be described", + "name": "y", + "type": "Series" + } + ] + }, + "hyperparameters": { + "fixed": { + "a_required_hyperparameter": { + "description": "this is a non tunable hyperparameter that needs to be specified by the user, because it does not have a default value", + "type": "int" + }, + "an_optional_hyperparameter": { + "description": "this is a non tunable hyperparameter that is optional, because it has a default value", + "type": "int", + "default": 1 + } + }, + "tunable": { + "a_simple_range_hyperparameter": { + "description": "hyperparameter documentation can be put here", + "default": 1, + "type": "int", + "range": [1, 10] + }, + "a_categorical_hyperparameter_of_type_int": { + "description": "Note that it has the field `values` instead of `range`", + "default": 1, + "type": "int", + "values": [1, 3, 7, 10] + }, + "a_categorical_hyperparameter_of_type_str": { + "default": "a", + "type": "str", + "values": ["a", "b", "c"] + }, + "a_multi_type_hyperparameter": { + "description": "this is a hyperparameter that allows more than one type", + "type": "multitype", + "default": "auto", + "types": { + "int": { + "description": "documentation can also be included here", + "range": [1, 10] + }, + "string": { + "values": ["some", "string", "values"] + } + } + }, + "conditional_hyperparameter": { + "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", + "type": "conditional", + "condition": "the_name_of_the_other_hyperparameter", + "values": { + "a": { + "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", + "type": "int", + "default": 0, + "range": [0, 10] + }, + "*": { + "description": "this will be used only if the value does not match any other definition", + "type": "float", + "default": 0.0, + "range": [0.0, 1.0] + } + } + } + } + } +} diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb new file mode 100644 index 00000000..901cc50b --- /dev/null +++ b/examples/tutorials/1. Using and MLPipeline.ipynb @@ -0,0 +1,640 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using an MLPipeline\n", + "\n", + "In this short guide we will go over the basic MLPipeline functionality.\n", + "\n", + "We will:\n", + "\n", + "1. Load a demo dataset.\n", + "2. Build a pipeline.\n", + "3. Explore the pipeline primitives, inputs and outputs.\n", + "4. Fit the pipeline to the dataset.\n", + "5. Make predictions using the fitted pipeline.\n", + "6. Evaluate the pipeline performance."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset\n", + "\n", + "The first step will be to load the Census dataset using the `load_census` function from the `utils` module provided alongside these tutorials." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This version of the Census dataset is prepared as a Classification (Supervised) Problem,\n", + "and has an input matrix `X` and an expected outcome `y` array." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adult Census dataset.\n", + "\n", + " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n", + "\n", + " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n", + " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n", + " (AFNLWGT>1)&& (HRSWK>0))\n", + "\n", + " Prediction task is to determine whether a person makes over 50K a year.\n", + "\n", + " source: \"UCI\n", + " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"\n", + " \n", + "Data Modality: single_table\n", + "Task Type: classification\n", + "Task Subtype: binary\n", + "Data shape: (32561, 14)\n", + "Target shape: (32561,)\n", + "Metric: accuracy_score\n", + "Extras: \n" + ] + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data from the dataset can be explored by looking at its `.data` and `.target` attributes." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML rendering of the DataFrame head omitted; equivalent to the text/plain output below)" ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174 0 40 United-States \n", + "1 0 0 13 United-States \n", + "2 0 0 40 United-States \n", + "3 0 0 40 United-States \n", + "4 0 0 40 Cuba " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.target[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset data can also be split in multiple parts for cross validation using the `dataset.get_splits` method.\n", + "\n", + "For this demo we will be making only one split, which is equivalent to a simple train/test holdout partitioning." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(24420, 14)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8141, 14)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a pipeline\n", + "\n", + "Once we have the dataset we will build a pipeline that works with it.\n", + "\n", + "In this case, we will be creating a short pipeline that uses the following primitives:\n", + "\n", + "- `ClassEncoder` from `mlprimitives`, which encodes the target variable `y` as integers.\n", + "- `CategoricalEncoder` from `mlprimitives`, which encodes all the categorical variables from the feature matrix `X`\n", + " using one-hot encoding.\n", + "- `SimpleImputer` from `sklearn`, which imputes any null values that may exist in the feature matrix `X`.\n", + "- `XGBClassifier` from `xgboost`, which learns to predict the target variable `y` using the feature matrix `X`.\n", + "- `ClassDecoder` from `mlprimitives`, which reverts the `ClassEncoder` transformation to return the original\n", + " target labels instead of integers."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the primitives included in this pipeline by having a look at its `primitives` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inputs\n", + "\n", + "We can also see the inputs of the pipeline using the `get_inputs` method.\n", + "\n", + "This will traverse the pipeline execution graph and show all the variables that need to be\n", + "provided by the user in order to fit this pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'type': 'ndarray'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can pass the `fit=False` argument, which will give us the variables needed\n", + "in order to make predictions." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'default': None, 'type': 'ndarray'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs(fit=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note how the `fit` method expects two variables `X` and `y`, while the `predict`\n", + "method only needs `X`, as the `y` variable has a default value of `None`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outputs\n", + "\n", + "Equally, we can see the outputs that the pipeline will return when used to make predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'y',\n", + " 'type': 'ndarray',\n", + " 'variable': 'mlprimitives.custom.preprocessing.ClassDecoder#1.y'}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_outputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the Pipeline to the Dataset\n", + "\n", + "Now that the pipeline is ready and we know its inputs and outputs, we can fit it to the\n", + "dataset by passing the training `X` and `y` variables to its `fit` method." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "After the pipeline has finished fitting, we can try to predict the `y_test` array values by\n", + "passing the `X_test` matrix to the `pipeline.predict` method." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the pipeline performance\n", + "\n", + "Now we can compare the predicted array with the actual test array to see how well\n", + "our pipeline performed.\n", + "\n", + "This can be done using the `dataset.score` method, which provides a suitable scoring\n", + "function for this kind of data and problem.\n", + "In this case, the dataset is just computing the accuracy score." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2.
Finding and Loading a Pipeline.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finding and Loading a Pipeline\n", + "\n", + "In this short tutorial we will show you how to search for pipelines suitable to solve\n", + "your prediction problem." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to find a suitable pipeline, the first thing we need is to identify\n", + "the type of problem (data modality + task type) that we are facing.\n", + "\n", + "This is a full list of current data modalities and task types that we cover:\n", + "\n", + "| Problem Type | Data Modality | Task Type |\n", + "|:-------------------------------------|:--------------|:------------------------|\n", + "| Single Table Classification | single_table | classification |\n", + "| Single Table Regression | single_table | regression |\n", + "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n", + "| Multi Table Classification | multi_table | classification |\n", + "| Multi Table Regression | multi_table | regression |\n", + "| Time Series Classification | timeseries | classification |\n", + "| Time Series Regression | timeseries | regression |\n", + "| Time Series Forecasting | timeseries | forecasting |\n", + "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n", + "| Image Classification | image | classification |\n", + "| Image Regression | image | regression |\n", + "| Graph Link Prediction | graph | link_prediction |\n", + "| Graph Vertex Nomination | graph | vertex_nomination |\n", + "| Graph Community Detection | graph | community_detection |\n", + "| Graph Matching | graph | graph_matching |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have identified our data modality and task type we can use the\n", + "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n", + "that support this particular problem type.\n", + "\n", + "For example, if we are looking for a pipeline to work on Image Classification\n", + "we will do the following query." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['image.classification.hog.rf',\n", + " 'image.classification.hog.xgb',\n", + " 'image.classification.resnet50.xgb',\n", + " 'keras.Sequential.SingleLayerCNNImageClassifier',\n", + " 'keras.Sequential.VGGCNNClassifier']" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "filters = {\n", + " 'metadata.data_type': 'image',\n", + " 'metadata.task_type': 'classification',\n", + "}\n", + "\n", + "find_pipelines(filters=filters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n", + "by passing its name to the `MLPipeline`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n", + "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", + "\n" + ] + } + ], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('image.classification.resnet50.xgb')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb new file mode 100644 index 00000000..7aa0ab2b --- /dev/null +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting MLPipeline Hyperparameters\n", + "\n", + "In this short guide we will see how to modify the hyperparameters\n", + "of an MLPipeline in order to change its behavior or performance.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load a dataset and a Pipeline.\n", + "2. Explore the pipeline hyperparameters.\n", + "3. Reload the pipeline with different hyperparameters.\n", + "4. Evaluate the pipeline performance on the dataset.\n", + "5. Set different pipeline hyperparameters.\n", + "6. Re-evaluate the pipeline performance on the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset and the Pipeline\n", + "\n", + "The first step will be to load the dataset and the pipeline that we will be using."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()\n", + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline Hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n", + "calling its `get_hyperparameters` method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'mean'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'n_estimators': 100,\n", + " 'max_depth': 3,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us a dictionary that contains one entry for each step in the pipeline.\n", + "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n", + "\n", + "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n", + "\n", + "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n", + "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'keep'): False,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'features'): 'auto',\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_unique_ratio'): 0,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n", + " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n", + " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n", + " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us the same information as before, but organized as a single one-level\n", + "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Pipeline hyperparameter values\n", + "\n", + "We can set some different hyperparameter values when loading the pipeline by adding the\n", + "`init_params` argument to `MLPipeline`.\n", + "\n", + "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n", + "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n", + "want to use on that step.\n", + "\n", + "As an example, we will set a different imputer strategy and a different xgboost max depth." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "init_params = {\n", + " 'sklearn.impute.SimpleImputer#1': {\n", + " 'strategy': 'median'\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'max_depth': 4\n", + " }\n", + "}\n", + "pipeline = MLPipeline(\n", + " primitives,\n", + " init_params=init_params\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now see how the hyperparameters are different than before."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate the pipeline performance to see what results these\n", + "hyperparameters produce." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "0.8647586291610367" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting hyperparameter values\n", + "\n", + "Another way of setting the pipeline hyperparameters without having to recreate it\n", + "from scratch, is to use its `set_hyperparameters` method.\n", + "\n", + "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n", + " 'max_labels': 10\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.3\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the hyperparameters can be set using the `flat` format:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can see how these hyperparameters now are different than before:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 10},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.3,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate again the pipeline performance and see how the hyperparameter\n", + "change affected the pipeline performance." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "0.870531875690947" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. 
Saving and Loading a Pipeline.ipynb new file mode 100644 index 00000000..ec1c6f97 --- /dev/null +++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving and Loading a Pipeline\n", + "\n", + "This short guide shows how to serialize a Pipeline into a file and later on load it\n", + "to make predictions.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load and fit a pipeline to a dataset.\n", + "2. Save the pipeline to a file.\n", + "3. Load the pipeline as a new object.\n", + "4. Make predictions using the new pipeline object." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the pipeline\n", + "\n", + "The first step will be to load and fit the pipeline to the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the Pipeline\n", + "\n", + "Once the pipeline is fit and ready to make predictions, we can store it in a file.\n", + "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('pipeline.pkl', 'wb') as f:\n", + " pickle.dump(pipeline, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Pipeline\n", + "\n", + "The saved pipeline can then be moved to another system where we can load it back to\n", + "memory using pickle again." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pipeline.pkl', 'rb') as f:\n", + " loaded_pipeline = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT**: All the dependencies need to also be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "Once the pipeline is loaded, it is ready to make predictions again." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "pred = loaded_pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred[0:5]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb new file mode 100644 index 00000000..769a69c1 --- /dev/null +++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb @@ -0,0 +1,721 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Partial execution and pipeline debugging\n", + "\n", + "In this guide we will show you how to execute a pipeline partially in order to\n", + "debug its internal behavior or optimize tuning processes.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load a pipeline and a dataset.\n", + "2. Explore the context after fitting the first primitive.\n", + "3. Fit the rest of the pipeline.\n", + "4. Partial execution during Predict.\n", + "5. Rerunning the last steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a pipeline and a dataset\n", + "\n", + "The first step will be to load the Census dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a reminder, we have a look at what the `X` and `y` variables we will be passing to our\n", + "pipeline look like." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML rendering of the DataFrame head omitted; equivalent to the text/plain output below)
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "28291 25 Private 193379 Assoc-acdm 12 \n", + "28636 55 Federal-gov 176904 HS-grad 9 \n", + "7919 30 Private 284395 HS-grad 9 \n", + "24861 17 Private 239346 10th 6 \n", + "23480 51 Private 57698 HS-grad 9 \n", + "\n", + " marital-status occupation relationship race \\\n", + "28291 Never-married Craft-repair Not-in-family White \n", + "28636 Married-civ-spouse Exec-managerial Husband White \n", + "7919 Married-civ-spouse Craft-repair Husband White \n", + "24861 Never-married Other-service Own-child White \n", + "23480 Married-spouse-absent Other-service Unmarried White \n", + "\n", + " sex capital-gain capital-loss hours-per-week native-country \n", + "28291 Male 0 0 45 United-States \n", + "28636 Male 0 0 40 United-States \n", + "7919 Male 0 0 50 United-States \n", + "24861 Male 0 0 18 United-States \n", + "23480 Female 0 0 40 United-States " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n", + "above or under 50K." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we build a suitable pipeline for our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the context after fitting the first primitive\n", + "\n", + "Once we know what primitives we are executing, we will execute only the first one\n", + "and see how the context changed after it.\n", + "\n", + "For this, we will execute the `fit` method passing the index of the last pipeline\n", + "step that we want to execute before returning. In this case, `0`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "fit_context = pipeline.fit(X_train, y_train, output_=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: Optionally, instead of passing the pipeline step index, we could pass the complete name\n", + "of the step, including the counter number: `mlprimitives.custom.preprocessing.ClassEncoder#1`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "output_step = 'mlprimitives.custom.preprocessing.ClassEncoder#1'\n", + "fit_context = pipeline.fit(X_train, y_train, output_=output_step)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In both cases, the output will be a dictionary containing all the context variables after\n", + "fitting and producing the first pipeline step." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['X', 'y', 'classes'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how we find the `X` and `y` variables that we passed to the `fit` method, but also a new `classes` variable\n", + "that was generated by the `mlprimitives.custom.preprocessing.ClassEncoder` primitive of the first pipeline step.\n", + "\n", + "This `classes` variable contains the list of unique values that the variable `y` originally had." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' >50K'], dtype=object)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context['classes']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also notice that the variable `y` has been transformed by the primitive into an array of\n", + "integer values." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context['y'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the rest of the pipeline\n", + "\n", + "After exploring the context generated by the first pipeline step we will now run\n", + "a few steps more, up to the point where the feature matrix is ready for the XGBClassifier.\n", + "\n", + "For this we will run the `fit` method again passing back the context that we just obtained\n", + "as well as the `start_` argument indicating that we need to start fitting on the second\n", + "step of the pipeline, skipping the first one, and the `output_` argument indicating that\n", + "we want to stop on the third step, right before the `XGBClassifier` primitive.\n", + "\n", + "Note how the context is passed using a double asterisk `**` syntax, but that individual\n", + "variables could also be passed as keyword arguments." 
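+ "\n", + "For reference, the call in the next cell should be equivalent to passing the context variables explicitly as keyword arguments. This is an added sketch, not a cell from the original notebook, and it assumes the `X`, `y` and `classes` keys shown above:\n", + "\n", + "```python\n", + "# Hypothetically equivalent to pipeline.fit(start_=1, output_=2, **fit_context)\n", + "fit_context = pipeline.fit(\n", + "    X=fit_context['X'],\n", + "    y=fit_context['y'],\n", + "    classes=fit_context['classes'],\n", + "    start_=1,\n", + "    output_=2,\n", + ")\n", + "```"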
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the context still contains the same variables as before" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['classes', 'X', 'y'])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But the variable `X` has been completely modified by the CategoricalEncoder and Imputer\n", + "primitives, so now it is a 100% numerical `numpy.ndarray` ready for the `XGBClassifier`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2.50000e+01, 1.93379e+05, 1.20000e+01, 0.00000e+00, 0.00000e+00,\n", + " 4.50000e+01, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context['X'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can pass the new context to the rest of the pipeline to finish fitting it.\n", + "\n", + "Note how, just like the `output_`, the `start_` step can also be indicated using the step\n", + "name instead of the index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_='xgboost.XGBClassifier#1', **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Partial execution during Predict\n", + "\n", + "Just like in the `fit` stage, the `predict` method also accepts a partial output specification." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "predict_context = pipeline.predict(X_test, output_=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['X', 'y'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As well as a partial execution after a specific pipeline step" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rerunning the last steps\n", + "\n", + "One of the key advantages of the partial execution that we just explored is the\n", + "possibility to re-fit and make new predictions multiple times with different\n", + "hyperparameter values for the last half of the pipeline without the need to\n", + "re-fit and re-execute the first half.\n", + "\n", + "This has the potential to greatly accelerate tuning processes in cases where there\n", + "are no tunable hyperparameters (or there are but we do not want to tune them) in\n", + "the preprocessing steps but the execution times are long.\n", + "\n", + "As an example, let's evaluate the performance of the pipeline and try to optimize\n", + "it by changing some hyperparameters of the classifier." 
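+ "\n", + "(Added for illustration; this is not a cell from the original notebook.) A minimal sketch of the kind of loop this enables, reusing the cached `fit_context` and `predict_context` so that only the classifier and decoder steps are re-run for each configuration:\n", + "\n", + "```python\n", + "# Try a few classifier configurations without repeating the preprocessing steps.\n", + "for learning_rate in (0.1, 0.3, 0.5):\n", + "    pipeline.set_hyperparameters({\n", + "        'xgboost.XGBClassifier#1': {'learning_rate': learning_rate}\n", + "    })\n", + "    pipeline.fit(start_=3, **fit_context)\n", + "    predictions = pipeline.predict(start_=3, **predict_context)\n", + "    print(learning_rate, dataset.score(y_test, predictions))\n", + "```"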
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.5\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_=3, **fit_context)\n", + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.872251566146665" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb new file mode 100644 index 00000000..6ecad5a5 --- /dev/null +++ b/examples/tutorials/6. Flexible outputs specification.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Flexible outputs specification\n", + "\n", + "In a previous tutorial we learnt how to obtain intermediate pipeline\n", + "outputs in order to debug a pipeline's internal behavior.\n", + "\n", + "In this guide we will go a bit further and learn how to define flexible outputs\n", + "for the pipeline in order to obtain the output of multiple primitives\n", + "at once.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load a pipeline and a dataset\n", + "2. Explore the output specification formats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a pipeline and a dataset\n", + "\n", + "The first step will be to load the Census dataset and the pipeline that we will be using."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also, just as a reminder, let's have a quick look at the steps of this pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And at the `X` and `y` variables that we will be passing to our pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
2829125Private193379Assoc-acdm12Never-marriedCraft-repairNot-in-familyWhiteMale0045United-States
2863655Federal-gov176904HS-grad9Married-civ-spouseExec-managerialHusbandWhiteMale0040United-States
791930Private284395HS-grad9Married-civ-spouseCraft-repairHusbandWhiteMale0050United-States
2486117Private23934610th6Never-marriedOther-serviceOwn-childWhiteMale0018United-States
2348051Private57698HS-grad9Married-spouse-absentOther-serviceUnmarriedWhiteFemale0040United-States
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "28291 25 Private 193379 Assoc-acdm 12 \n", + "28636 55 Federal-gov 176904 HS-grad 9 \n", + "7919 30 Private 284395 HS-grad 9 \n", + "24861 17 Private 239346 10th 6 \n", + "23480 51 Private 57698 HS-grad 9 \n", + "\n", + " marital-status occupation relationship race \\\n", + "28291 Never-married Craft-repair Not-in-family White \n", + "28636 Married-civ-spouse Exec-managerial Husband White \n", + "7919 Married-civ-spouse Craft-repair Husband White \n", + "24861 Never-married Other-service Own-child White \n", + "23480 Married-spouse-absent Other-service Unmarried White \n", + "\n", + " sex capital-gain capital-loss hours-per-week native-country \n", + "28291 Male 0 0 45 United-States \n", + "28636 Male 0 0 40 United-States \n", + "7919 Male 0 0 50 United-States \n", + "24861 Male 0 0 18 United-States \n", + "23480 Female 0 0 40 United-States " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n", + "above or under 50K." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the output specification formats\n", + "\n", + "In the previous tutorial we learnt that the output of a pipeline can be specified\n", + "in multiple formats:\n", + "\n", + "* An integer indicating the pipeline step index, which will return us the complete\n", + " context after producing the corresponding step.\n", + "* A string indicating the name of a step, which will also return us the complete\n", + " context after producing the corresponding step.\n", + " \n", + "A part from these two options, there are a few more.\n", + "\n", + "### Single variable specification\n", + "\n", + "Variables can be individually specified by passing a string in the format\n", + "`{pipeline-step-name}.{variable-name}`.\n", + "\n", + "Note that the `pipeline-step-name` part is not only the primitive name, but\n", + "also the counter number at the end of it.\n", + "\n", + "For example, if we want to explore the `classes` variable generated by\n", + "the `ClassEncoder` primitive during `fit`, we can do the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' >50K'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_spec = 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes'\n", + "pipeline.fit(X_train, y_train, output_=output_spec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: Just like with the full context specification, when a variable is specified\n", + "the pipeline will be executed only up to the step that produces the indicated variable." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List of variables\n", + "\n", + "In some cases we will be interested in obtaining more than one variable\n", + "at a time.\n", + "\n", + "In order to do this, instead of a single string specification we can pass\n", + "a list of strings." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "output_spec = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n", + "]\n", + "out = pipeline.fit(X_train, y_train, output_=output_spec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output will be a `tuple` containing the variables in the specified order." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "y, classes = out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want to obtain variables from multiple pipeline steps we simply need\n", + "to specify all of them at once. Again, **MLBlocks** will run all the necessary\n", + "pipeline steps, accumulating the desired variables up to the last step needed." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "output_spec = [\n", + " 'sklearn.impute.SimpleImputer#1.X',\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n", + "]\n", + "X, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If required, we can even capture the same variable along the different pipeline steps!" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "output_spec = [\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n", + " 'sklearn.impute.SimpleImputer#1.X',\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n", + "]\n", + "X_1, X_2, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(24420, 108)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(24420, 108)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_2.shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb new file mode 100644 index 00000000..484e0b22 --- /dev/null +++ b/examples/tutorials/7. Tuning a Pipeline.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tuning a Pipeline\n", + "\n", + "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "1. Load a dataset and a pipeline\n", + "2. Explore the pipeline tunable hyperparameters\n", + "3. Write a scoring function\n", + "4. Build a BTB Tunable and BTB Tuner\n", + "5. Write a tuning loop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset and the pipeline\n", + "\n", + "The first step will be to load the dataset that we were using in previous tutorials." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And load a suitable pipeline.\n", + "\n", + "Note how in this case we are using the variable name `template` instead of `pipeline`,\n", + "because this will only be used as a template for the pipelines that we will create\n", + "and evaluate during the later tuning loop."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "template = MLPipeline('single_table.classification.xgb')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the pipeline tunable hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n", + "by calling the `get_tunable_hyperparameters` method.\n", + "\n", + "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n", + "that is compatible with BTB." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunable_hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write a scoring function\n", + "\n", + "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n", + "\n", + "For this reason, we will start by writing a scoring function that will expect only one\n", + "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n", + "method from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(hyperparameters=None):\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By calling this function without any arguments we will obtain the score obtained\n", + "with the default hyperparameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.863978563379761" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_score = cross_validate()\n", + "default_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, we can certify that by passing a hyperparameters dictionary the new hyperparameters\n", + "will be used, resulting in a different score." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.868554574842" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hyperparameters = {\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", + "}\n", + "cross_validate(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a BTB Tunable\n", + "\n", + "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n", + "\n", + "For this we will use its `from_dict` method, passing our hyperparameters dict." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from baytune.tuning import Tunable\n", + "\n", + "tunable = Tunable.from_dict(tunable_hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the BTB Tuner\n", + "\n", + "After creating the Tunable, we need to create a Tuner to tune it.\n", + "\n", + "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n", + "for the optimization." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from baytune.tuning import GPTuner\n", + "\n", + "tuner = GPTuner(tunable)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, since we already know the score obtained by the default arguments and\n", + "these have a high probability of being already decent, we will inform the tuner\n", + "about their performance.\n", + "\n", + "In order to obtain the default hyperparameters used before we can either call\n", + "the template `get_hyperparameters(flat=True)` method or use `tunable.get_defaults()`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "defaults = tunable.get_defaults()\n", + "defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "tuner.record(defaults, default_score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the Tuning loop\n", + "\n", + "Once we have the tuner ready we can start the tuning loop.\n", + "\n", + "During this loop we will:\n", + "\n", + "1. 
Ask the tuner for a new hyperparameter proposal\n", + "2. Run the `cross_validate` function to evaluate these hyperparameters\n", + "3. Record the obtained score back to the tuner.\n", + "4. If the obtained score is better than the previous one, store the proposal." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scoring pipeline 1\n", + "New best found: 0.871994161365419\n", + "scoring pipeline 2\n", + "New best found: 0.8723319756253888\n", + "scoring pipeline 3\n", + "scoring pipeline 4\n", + "scoring pipeline 5\n", + "scoring pipeline 6\n", + "scoring pipeline 7\n", + "scoring pipeline 8\n", + "scoring pipeline 9\n", + "scoring pipeline 10\n" + ] + } + ], + "source": [ + "best_score = default_score\n", + "best_proposal = defaults\n", + "\n", + "for iteration in range(10):\n", + " print(\"scoring pipeline {}\".format(iteration + 1))\n", + " \n", + " proposal = tuner.propose()\n", + " score = cross_validate(proposal)\n", + " \n", + " tuner.record(proposal, score)\n", + " \n", + " if score > best_score:\n", + " print(\"New best found: {}\".format(score))\n", + " best_score = score\n", + " best_proposal = proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n", + "which can be used to generate a new pipeline instance." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 60,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 190,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.13575511242790694,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.6326488945712287,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_proposal" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline = MLPipeline(template.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.set_hyperparameters(best_proposal)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.fit(dataset.data, dataset.target)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb new file mode 100644 index 00000000..a7e9d69a --- /dev/null +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -0,0 +1,705 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Selecting and Tuning pipelines\n", + "\n", + "This guide shows you how to search for multiple pipelines for your problem\n", + "and later on use a [BTBSession](https://mlbazaar.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n", + "to select and tune the best one.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "\n", + "1. Load a dataset\n", + "2. Search and load suitable templates\n", + "3. Write a scoring function\n", + "4. Build a BTBSession for our templates\n", + "5. Run the session to find the best pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset\n", + "\n", + "The first step will be to load the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_census\n", + "\n", + "dataset = load_census()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adult Census dataset.\n", + "\n", + " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n", + "\n", + " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n", + " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n", + " (AFNLWGT>1)&& (HRSWK>0))\n", + "\n", + " Prediction task is to determine whether a person makes over 50K a year.\n", + "\n", + " source: \"UCI\n", + " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"\n", + " \n", + "Data Modality: single_table\n", + "Task Type: classification\n", + "Task Subtype: binary\n", + "Data shape: (32561, 14)\n", + "Target shape: (32561,)\n", + "Metric: accuracy_score\n", + "Extras: \n" + ] + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find and load suitable Templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `mlblocks.discovery.find_pipelines` function to search\n", + "for compatible pipelines.\n", + "\n", + "In this case, we will be looking for `single_table/classification` pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "templates = find_pipelines('single_table.classification')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['single_table.classification',\n", + " 'single_table.classification.text',\n", + " 'single_table.classification.xgb']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we will create a dictionary with MLPipeline instances that will be used as tempaltes for our tuning." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "templates_dict = {\n", + " template: MLPipeline(template)\n", + " for template in templates\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates_dict['single_table.classification.xgb']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a scoring function\n", + "\n", + "In order to use a `BTBSession` we will need a function that is able to score a proposal,\n", + "which will always be a pair of template name and proposed hyperparameters.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation over our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(template_name, hyperparameters=None):\n", + " template = templates_dict[template_name]\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup the BTBSession" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will create another dictionary with the tunable hyperparameters of each template.\n", + "This will be used by the BTBSession to know how to tune each template." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "tunables = {\n", + " name: template.get_tunable_hyperparameters(flat=True)\n", + " for name, template in templates_dict.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunables['single_table.classification.xgb']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then create a `BTBSession` instance passing them and the `cross_validate` function.\n", + "\n", + "We will also be setting it in `verbose` mode, so we can have a better insight on what is going on." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from baytune.session import BTBSession\n", + "\n", + "session = BTBSession(tunables, cross_validate, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Run the session" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After everything is set up, we can start running the tuning session passing it\n", + "the number of iterations that we want to perform." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "00c20e4b982f42a1873c0d12f550ee4b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:0016) && (AGI>100) && + (AFNLWGT>1)&& (HRSWK>0)) + + Prediction task is to determine whether a person makes over 50K a year. + + source: "UCI + sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income" + """ + + dataset_path = _load('census_train') + + X = pd.read_csv(dataset_path) + y = X.pop('label').values + + return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table', + 'classification', 'binary', stratify=True) \ No newline at end of file diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index cfc0ef6a..fa7130da 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -7,20 +7,30 @@ seamlessly combining tools from any python library with a simple, common and uniform interface. 
* Free software: MIT license -* Documentation: https://HDI-Project.github.io/MLBlocks +* Documentation: https://MLBazaar.github.io/MLBlocks """ -from mlblocks.mlblock import MLBlock # noqa -from mlblocks.mlpipeline import MLPipeline # noqa -from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive # noqa +from mlblocks.discovery import ( + add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths, + get_primitives_paths, load_pipeline, load_primitive) +from mlblocks.mlblock import MLBlock +from mlblocks.mlpipeline import MLPipeline __author__ = 'MIT Data To AI Lab' __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.2.5-dev' +__version__ = '0.6.3.dev0' __all__ = [ - 'MLBlock', 'MLPipeline', 'add_primitives_path', - 'get_primitives_paths', 'load_primitive' + 'MLBlock', + 'MLPipeline', + 'add_pipelines_path', + 'add_primitives_path', + 'find_pipelines', + 'find_primitives', + 'get_pipelines_paths', + 'get_primitives_paths', + 'load_pipeline', + 'load_primitive' ] diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py deleted file mode 100644 index fba968e8..00000000 --- a/mlblocks/datasets.py +++ /dev/null @@ -1,441 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Datasets module. - -This module contains functions that allow loading datasets for easy -testing of pipelines and primitives over multiple data modalities -and task types. - -The available datasets by data modality and task type are: - -+---------------+---------------+-------------------------+ -| Dataset | Data Modality | Task Type | -+===============+===============+=========================+ -| Amazon | Graph | Community Detection | -+---------------+---------------+-------------------------+ -| DIC28 | Graph | Graph Matching | -+---------------+---------------+-------------------------+ -| UMLs | Graph | Link Prediction | -+---------------+---------------+-------------------------+ -| Nomination | Graph | Vertex Nomination | -+---------------+---------------+-------------------------+ -| USPS | Image | Classification | -+---------------+---------------+-------------------------+ -| Hand Geometry | Image | Regression | -+---------------+---------------+-------------------------+ -| Iris | Single Table | Classification | -+---------------+---------------+-------------------------+ -| Jester | Single Table | Collaborative Filtering | -+---------------+---------------+-------------------------+ -| Boston | Single Table | Regression | -+---------------+---------------+-------------------------+ -| Wiki QA | Multi Table | Classification | -+---------------+---------------+-------------------------+ -| Personae | Text | Classification | -+---------------+---------------+-------------------------+ -| News Groups | Text | Classification | -+---------------+---------------+-------------------------+ - -""" - -import io -import os -import tarfile -import urllib - -import networkx as nx -import numpy as np -import pandas as pd -from keras.preprocessing.image import img_to_array, load_img -from sklearn import datasets -from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score -from sklearn.model_selection import KFold, StratifiedKFold, train_test_split - -INPUT_SHAPE = [224, 224, 3] - -DATA_PATH = os.path.join( - os.path.dirname(__file__), - 'data' -) -DATA_URL = 'http://dai-mlblocks.s3.amazonaws.com/%7B%7D.tar.gz' - - -class Dataset(): - """Dataset class. 
- - This class represents the abstraction of a dataset and works as - a container of all the things needed in order to use a dataset - for testing. - - Among other things, it includes the actual dataset data, information - about its origin, a score function that works for this dataset, - and a method to split the data in multiple ways for goodnes-of-fit - evaluation. - - Attributes: - name (str): Name of this dataset. - description (str): Short description about the data that composes this dataset. - data (array-like): Numpy array or pandas DataFrame containing all the data of - this dataset, excluding the labels or target values. - target (array-like): Numpy array or pandas Series containing the expected labels - or values - **kwargs: Any additional keyword argument passed on initailization is also - available as instance attributes. - - Args: - description (str): Short description about the data that composes this dataset. - The first line of the description is expected to be a human friendly - name for the dataset, and will be set as the `name` attribute. - data (array-like): Numpy array or pandas DataFrame containing all the data of - this dataset, excluding the labels or target values. - target (array-like): Numpy array or pandas Series containing the expected labels - or values - score (callable): Function that will be used to compute the score of this dataset. - shuffle (bool): Whether or not to shuffle the data before splitting. - stratify (bool): Whther to use a stratified or regular KFold for splitting. - **kwargs: Any additional keyword argument passed on initialization will be made - available as instance attributes. - """ - def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs): - - self.name = description.splitlines()[0] - self.description = description - - self.data = data - self.target = target - - self._stratify = stratify - self._shuffle = shuffle - self._score = score - - self.__dict__.update(kwargs) - - def score(self, *args, **kwargs): - """Scoring function for this dataset. - - Args: - \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be - directly passed to the given scoring function. - - Returns: - float: - The computed score. - """ - return self._score(*args, **kwargs) - - def __repr__(self): - return self.name - - def describe(self): - """Print the description of this Dataset on stdout.""" - print(self.description) - - @staticmethod - def _get_split(data, index): - if hasattr(data, 'iloc'): - return data.iloc[index] - else: - return data[index] - - def get_splits(self, n_splits=1): - """Return splits of this dataset ready for Cross Validation. - - If n_splits is 1, a tuple containing the X for train and test - and the y for train and test is returned. - Otherwise, if n_splits is bigger than 1, a list of such tuples - is returned, one for each split. - - Args: - n_splits (int): Number of times that the data needs to be splitted. - - Returns: - tuple or list: - if n_splits is 1, a tuple containing the X for train and test - and the y for train and test is returned. - Otherwise, if n_splits is bigger than 1, a list of such tuples - is returned, one for each split. 
- """ - if n_splits == 1: - stratify = self.target if self._stratify else None - - return train_test_split( - self.data, - self.target, - shuffle=self._shuffle, - stratify=stratify - ) - - else: - cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) - - splits = list() - for train, test in cv.split(self.data, self.target): - X_train = self._get_split(self.data, train) - y_train = self._get_split(self.target, train) - X_test = self._get_split(self.data, test) - y_test = self._get_split(self.target, test) - splits.append((X_train, X_test, y_train, y_test)) - - return splits - - -def _download(dataset_name, dataset_path): - url = DATA_URL.format(dataset_name) - response = urllib.request.urlopen(url) - bytes_io = io.BytesIO(response.read()) - - with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf: - tf.extractall(DATA_PATH) - - -def _load(dataset_name): - if not os.path.exists(DATA_PATH): - os.makedirs(DATA_PATH) - - dataset_path = os.path.join(DATA_PATH, dataset_name) - if not os.path.exists(dataset_path): - _download(dataset_name, dataset_path) - - return dataset_path - - -def _load_images(image_dir, filenames): - images = [] - for filename in filenames: - filename = os.path.join(image_dir, filename) - - image = load_img(filename) - image = image.resize(tuple(INPUT_SHAPE[0:2])) - image = img_to_array(image) - image = image / 255.0 # Quantize images. - images.append(image) - - return np.array(images) - - -def _load_csv(dataset_path, name, set_index=False): - csv_path = os.path.join(dataset_path, name + '.csv') - df = pd.read_csv(csv_path) - - if set_index: - df = df.set_index(df.columns[0], drop=False) - - return df - - -def load_usps(): - """USPs Digits Dataset. - - The data of this dataset is a 3d numpy array vector with shape (224, 224, 3) - containing 9298 224x224 RGB photos of handwritten digits, and the target is - a 1d numpy integer array containing the label of the digit represented in - the image. - """ - dataset_path = _load('usps') - - df = _load_csv(dataset_path, 'data') - X = _load_images(os.path.join(dataset_path, 'images'), df.image) - y = df.label.values - - return Dataset(load_usps.__doc__, X, y, accuracy_score, stratify=True) - - -def load_handgeometry(): - """Hand Geometry Dataset. - - The data of this dataset is a 3d numpy array vector with shape (224, 224, 3) - containing 112 224x224 RGB photos of hands, and the target is a 1d numpy - float array containing the width of the wrist in centimeters. - """ - dataset_path = _load('handgeometry') - - df = _load_csv(dataset_path, 'data') - X = _load_images(os.path.join(dataset_path, 'images'), df.image) - y = df.target.values - - return Dataset(load_handgeometry.__doc__, X, y, r2_score) - - -def load_personae(): - """Personae Dataset. - - The data of this dataset is a 2d numpy array vector containing 145 entries - that include texts written by Dutch users in Twitter, with some additional - information about the author, and the target is a 1d numpy binary integer - array indicating whether the author was extrovert or not. - """ - dataset_path = _load('personae') - - X = _load_csv(dataset_path, 'data') - y = X.pop('label').values - - return Dataset(load_personae.__doc__, X, y, accuracy_score, stratify=True) - - -def load_umls(): - """UMLs Dataset. - - The data consists of information about a 135 Graph and the relations between - their nodes given as a DataFrame with three columns, source, target and type, - indicating which nodes are related and with which type of link. 
The target is - a 1d numpy binary integer array indicating whether the indicated link exists - or not. - """ - dataset_path = _load('umls') - - X = _load_csv(dataset_path, 'data') - y = X.pop('label').values - - graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml'))) - - return Dataset(load_umls.__doc__, X, y, accuracy_score, stratify=True, graph=graph) - - -def load_dic28(): - """DIC28 Dataset from Pajek. - - This network represents connections among English words in a dictionary. - It was generated from Knuth's dictionary. Two words are connected by an - edge if we can reach one from the other by - - changing a single character (e. g., work - word) - - adding / removing a single character (e. g., ever - fever). - - There exist 52,652 words (vertices in a network) having 2 up to 8 characters - in the dictionary. The obtained network has 89038 edges. - """ - - dataset_path = _load('dic28') - - X = _load_csv(dataset_path, 'data') - y = X.pop('label').values - - graph1 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph1.gml'))) - graph2 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph2.gml'))) - - graph = graph1.copy() - graph.add_nodes_from(graph2.nodes(data=True)) - graph.add_edges_from(graph2.edges) - graph.add_edges_from(X[['graph1', 'graph2']].values) - - graphs = { - 'graph1': graph1, - 'graph2': graph2, - } - - return Dataset(load_dic28.__doc__, X, y, accuracy_score, - stratify=True, graph=graph, graphs=graphs) - - -def load_nomination(): - """Sample 1 of graph vertex nomination data from MII Lincoln Lab. - - Data consists of one graph whose nodes contain two attributes, attr1 and attr2. - Associated with each node is a label that has to be learned and predicted. - """ - - dataset_path = _load('nomination') - - X = _load_csv(dataset_path, 'data') - y = X.pop('label').values - - graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml'))) - - return Dataset(load_nomination.__doc__, X, y, accuracy_score, stratify=True, graph=graph) - - -def load_amazon(): - """Amazon product co-purchasing network and ground-truth communities. - - Network was collected by crawling Amazon website. It is based on Customers Who Bought - This Item Also Bought feature of the Amazon website. If a product i is frequently - co-purchased with product j, the graph contains an undirected edge from i to j. - Each product category provided by Amazon defines each ground-truth community. - """ - - dataset_path = _load('amazon') - - X = _load_csv(dataset_path, 'data') - y = X.pop('label').values - - graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml'))) - - return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score, graph=graph) - - -def load_jester(): - """Ratings from the Jester Online Joke Recommender System. - - This dataset consists of over 1.7 million instances of (user_id, item_id, rating) - triples, which is split 50-50 into train and test data. - - source: "University of California Berkeley, CA" - sourceURI: "http://eigentaste.berkeley.edu/dataset/" - """ - - dataset_path = _load('jester') - - X = _load_csv(dataset_path, 'data') - y = X.pop('rating').values - - return Dataset(load_jester.__doc__, X, y, r2_score) - - -def load_wikiqa(): - """A Challenge Dataset for Open-Domain Question Answering. - - WikiQA dataset is a publicly available set of question and sentence (QS) pairs, - collected and annotated for research on open-domain question answering. 
- - source: "Microsoft" - sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#" - """ # noqa - - dataset_path = _load('wikiqa') - - data = _load_csv(dataset_path, 'data', set_index=True) - questions = _load_csv(dataset_path, 'questions', set_index=True) - sentences = _load_csv(dataset_path, 'sentences', set_index=True) - vocabulary = _load_csv(dataset_path, 'vocabulary', set_index=True) - - entities = { - 'data': (data, 'd3mIndex', None), - 'questions': (questions, 'qIndex', None), - 'sentences': (sentences, 'sIndex', None), - 'vocabulary': (vocabulary, 'index', None) - } - relationships = [ - ('questions', 'qIndex', 'data', 'qIndex'), - ('sentences', 'sIndex', 'data', 'sIndex') - ] - - target = data.pop('isAnswer').values - - return Dataset(load_wikiqa.__doc__, data, target, accuracy_score, startify=True, - entities=entities, relationships=relationships) - - -def load_newsgroups(): - """20 News Groups Dataset. - - The data of this dataset is a 1d numpy array vector containing the texts - from 11314 newsgroups posts, and the target is a 1d numpy integer array - containing the label of one of the 20 topics that they are about. - """ - dataset = datasets.fetch_20newsgroups() - return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target, - accuracy_score, stratify=True) - - -def load_iris(): - """Iris Dataset.""" - dataset = datasets.load_iris() - return Dataset(load_iris.__doc__, dataset.data, dataset.target, - accuracy_score, stratify=True) - - -def load_boston(): - """Boston House Prices Dataset.""" - dataset = datasets.load_boston() - return Dataset(load_boston.__doc__, dataset.data, dataset.target, r2_score) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py new file mode 100644 index 00000000..24a469da --- /dev/null +++ b/mlblocks/discovery.py @@ -0,0 +1,470 @@ +# -*- coding: utf-8 -*- + +""" +Primitives and Pipelines discovery module. + +This module contains functions to load primitive and pipeline +annotations, as well as to configure how MLBlocks finds the +primitives and pipelines. +""" + +import json +import logging +import os +import re +import sys + +import pkg_resources + +LOGGER = logging.getLogger(__name__) + +_PRIMITIVES_PATHS = [ + os.path.join(os.getcwd(), 'mlprimitives'), + os.path.join(sys.prefix, 'mlprimitives'), + os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy + os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy +] + +_PIPELINES_PATHS = [ + os.path.join(os.getcwd(), 'mlpipelines'), +] + + +def _add_lookup_path(path, paths): + """Add a new path to lookup. + + The new path will be inserted in the first place of the list, + so any element found in this new folder will take precedence + over any other element with the same name that existed in the + system before. + + Args: + path (str): + path to add + paths (list): + list where the new path will be added. + + Raises: + ValueError: + A ``ValueError`` will be raised if the path is not valid. + + Returns: + bool: + Whether the new path was added or not. + """ + if path not in paths: + if not os.path.isdir(path): + raise ValueError('Invalid path: {}'.format(path)) + + paths.insert(0, os.path.abspath(path)) + return True + + return False + + +def add_primitives_path(path): + """Add a new path to look for primitives. 
+ + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other primitive with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A ``ValueError`` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PRIMITIVES_PATHS) + if added: + LOGGER.debug('New primitives path added: %s', path) + + +def add_pipelines_path(path): + """Add a new path to look for pipelines. + + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other pipeline with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A ``ValueError`` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PIPELINES_PATHS) + if added: + LOGGER.debug('New pipelines path added: %s', path) + + +def _load_entry_points(entry_point_name, entry_point_group='mlblocks'): + """Get a list of folders from entry points. + + This list will include the value of any entry point named after the given + ``entry_point_name`` published under the given ``entry_point_group``. + + An example of such an entry point would be:: + + entry_points = { + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' + ] + } + + where the module ``some_module`` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Args: + entry_point: + The name of the ``entry_point`` to look for. + + Returns: + list: + The list of folders. + """ + lookup_paths = list() + entry_points = pkg_resources.iter_entry_points(entry_point_group) + for entry_point in entry_points: + if entry_point.name == entry_point_name: + paths = entry_point.load() + if isinstance(paths, str): + lookup_paths.append(paths) + elif isinstance(paths, (list, tuple)): + lookup_paths.extend(paths) + + return lookup_paths + + +def get_primitives_paths(): + """Get the list of folders where primitives will be looked for. + + This list will include the values of all the entry points named ``primitives`` + published under the entry point group ``mlblocks``. + + Also, for backwards compatibility reasons, the paths from the entry points + named ``jsons_path`` published under the ``mlprimitives`` group will also + be included. + + An example of such an entry point would be:: + + entry_points = { + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' + ] + } + + where the module ``some_module`` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. + """ + paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives') + return _PRIMITIVES_PATHS + list(set(paths)) + + +def get_pipelines_paths(): + """Get the list of folders where pipelines will be looked for. + + This list will include the values of all the entry points named ``pipelines`` + published under the entry point group ``mlblocks``. + + An example of such an entry point would be:: + + entry_points = { + 'mlblocks': [ + 'pipelines=some_module:SOME_VARIABLE' + ] + } + + where the module ``some_module`` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. 
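# Illustrative sketch (not part of the patch): the two ways described above of
# making primitive annotations discoverable. ``my_primitives`` and
# ``my_library`` are hypothetical names.
import os

from mlblocks.discovery import add_primitives_path, get_primitives_paths

# 1. At runtime, prepend a folder to the lookup list (it must exist):
os.makedirs('my_primitives', exist_ok=True)
add_primitives_path('my_primitives')
assert get_primitives_paths()[0] == os.path.abspath('my_primitives')

# 2. From an installed package, publish an ``mlblocks`` entry point in setup.py:
#
#    entry_points={
#        'mlblocks': [
#            'primitives=my_library:MLBLOCKS_PRIMITIVES',
#        ],
#    }
#
#    # my_library/__init__.py
#    MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'jsons')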
+ """ + return _PIPELINES_PATHS + _load_entry_points('pipelines') + + +def _load_json(json_path): + with open(json_path, 'r') as json_file: + LOGGER.debug('Loading %s', json_path) + return json.load(json_file) + + +def _load(name, paths): + """Locate and load the JSON annotation in any of the given paths. + + All the given paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + Path to a JSON file or name of the JSON to look for withouth the ``.json`` extension. + paths (list): + list of paths where the primitives will be looked for. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + """ + if os.path.isfile(name): + return _load_json(name) + + for base_path in paths: + parts = name.split('.') + number_of_parts = len(parts) + + for folder_parts in range(number_of_parts): + folder = os.path.join(base_path, *parts[:folder_parts]) + filename = '.'.join(parts[folder_parts:]) + '.json' + json_path = os.path.join(folder, filename) + + if os.path.isfile(json_path): + return _load_json(json_path) + + +def load_primitive(name): + """Locate and load the primitive JSON annotation. + + All the primitive paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + Path to a JSON file or name of the JSON to look for withouth the ``.json`` extension. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A ``ValueError`` will be raised if the primitive cannot be found. + """ + primitive = _load(name, get_primitives_paths()) + if primitive is None: + raise ValueError("Unknown primitive: {}".format(name)) + + return primitive + + +def load_pipeline(name): + """Locate and load the pipeline JSON annotation. + + All the pipeline paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + Path to a JSON file or name of the JSON to look for withouth the ``.json`` extension. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A ``ValueError`` will be raised if the pipeline cannot be found. + """ + pipeline = _load(name, get_pipelines_paths()) + if pipeline is None: + raise ValueError("Unknown pipeline: {}".format(name)) + + return pipeline + + +def _search_annotations(base_path, pattern, parts=None): + """Search for annotations within the given path. + + If the indicated path has subfolders, search recursively within them. + + If a pattern is given, return only the annotations whose name + matches the pattern. + + Args: + base_path (str): + path to the folder to be searched for annotations. + pattern (str): + Regular expression to search in the annotation names. + parts (list): + Optional. List containing the parent folders that are also part + of the annotation name. Used during recursion to be able to + build the final annotation name before returning it. + + Returns: + dict: + dictionary containing paths as keys and annotation names as + values. 
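# Illustrative sketch (not part of the patch): how the dotted-name resolution in
# ``_load`` above maps a name to a JSON file. The primitive name is only an
# example of an annotation that may be installed (for instance by mlprimitives).
from mlblocks.discovery import load_primitive

# For 'sklearn.preprocessing.StandardScaler' every lookup folder is scanned for:
#   <folder>/sklearn.preprocessing.StandardScaler.json
#   <folder>/sklearn/preprocessing.StandardScaler.json
#   <folder>/sklearn/preprocessing/StandardScaler.json
# The first match wins; a ValueError is raised if no folder contains it.
primitive = load_primitive('sklearn.preprocessing.StandardScaler')
print(primitive['primitive'])   # fully qualified name of the wrapped callable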
+ """ + pattern = re.compile(pattern) + annotations = dict() + parts = parts or list() + if os.path.exists(base_path): + for name in os.listdir(base_path): + path = os.path.abspath(os.path.join(base_path, name)) + if os.path.isdir(path): + annotations.update(_search_annotations(path, pattern, parts + [name])) + elif path not in annotations: + name = '.'.join(parts + [name]) + if pattern.search(name) and name.endswith('.json'): + annotations[path] = name[:-5] + + return annotations + + +def _match(annotation, key, values): + """Check if the anotation has the key and it matches any of the values. + + If the given key is not found but it contains dots, split by the dots + and consider each part a sublevel in the annotation. + + If the key value within the annotation is a list or a dict, check + whether any of the given values is contained within it instead of + checking for equality. + + Args: + annotation (dict): + Dictionary annotation. + key (str): + Key to search within the annoation. It can contain dots to + separated nested subdictionary levels within the annotation. + values (object or list): + Value or list of values to search for. + + Returns: + bool: + whether there is a match or not. + """ + if not isinstance(values, list): + values = [values] + + if key not in annotation: + if '.' in key: + name, key = key.split('.', 1) + part = annotation.get(name) or dict() + return _match(part, key, values) + else: + return False + + annotation_value = annotation[key] + + for value in values: + if isinstance(annotation_value, (list, dict)): + return value in annotation_value + elif annotation_value == value: + return True + + return False + + +def _find_annotations(paths, loader, pattern, filters): + """Find matching annotations within the given paths. + + Math annotations by both name pattern and filters. + + Args: + paths (list): + List of paths to search annotations in. + loader (callable): + Function to use to load the annotation contents. + pattern (str): + Pattern to match against the annotation name. + filters (dict): + Dictionary containing key/value filters. + + Returns: + list: + names of the matching annotations. + """ + annotations = dict() + for base_path in paths: + annotations.update(_search_annotations(base_path, pattern)) + + matching = list() + for name in sorted(annotations.values()): + annotation = loader(name) + for key, value in filters.items(): + if not _match(annotation, key, value): + break + + else: + matching.append(name) + + return matching + + +def find_primitives(pattern='', filters=None): + """Find primitives by name and filters. + + If a patter is given, only the primitives whose name matches + the pattern will be returned. + + If filters are given, they should be a dictionary containing key/value + filters that will have to be matched within the primitive annotation + for it to be included in the results. + + If the given key is not found but it contains dots, split by the dots + and consider each part a sublevel in the annotation. + + If the key value within the annotation is a list or a dict, check + whether any of the given values is contained within it instead of + checking for equality. + + Args: + pattern (str): + Regular expression to match agains the primitive names. + filters (dict): + Dictionary containing the filters to apply over the matchin + primitives. + + Returns: + list: + Names of the matching primitives. 
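# Illustrative sketch (not part of the patch): querying the catalog with the
# matching rules implemented above. The filter key follows the annotation
# metadata layout used by mlprimitives and is only an example.
from mlblocks.discovery import find_primitives

# The pattern is a regular expression searched inside the primitive names:
print(find_primitives('xgboost'))

# Dotted filter keys descend into nested annotation levels, and list or dict
# values are matched by containment rather than strict equality:
estimators = find_primitives(filters={'classifiers.type': 'estimator'})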
+ """ + filters = filters or dict() + return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters) + + +def find_pipelines(pattern='', filters=None): + """Find pipelines by name and filters. + + If a patter is given, only the pipelines whose name matches + the pattern will be returned. + + If filters are given, they should be a dictionary containing key/value + filters that will have to be matched within the pipeline annotation + for it to be included in the results. + + If the given key is not found but it contains dots, split by the dots + and consider each part a sublevel in the annotation. + + If the key value within the annotation is a list or a dict, check + whether any of the given values is contained within it instead of + checking for equality. + + Args: + pattern (str): + Regular expression to match agains the pipeline names. + filters (dict): + Dictionary containing the filters to apply over the matchin + pipelines. + + Returns: + list: + Names of the matching pipelines. + """ + filters = filters or dict() + return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 9b6ec0d0..d2295722 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -3,14 +3,29 @@ """Package where the MLBlock class is defined.""" import importlib +import logging +from copy import deepcopy -from mlblocks.primitives import load_primitive +from mlblocks.discovery import load_primitive + +LOGGER = logging.getLogger(__name__) def import_object(object_name): """Import an object from its Fully Qualified Name.""" - package, name = object_name.rsplit('.', 1) - return getattr(importlib.import_module(package), name) + + if isinstance(object_name, str): + parent_name, attribute = object_name.rsplit('.', 1) + try: + parent = importlib.import_module(parent_name) + except ImportError: + grand_parent_name, parent_name = parent_name.rsplit('.', 1) + grand_parent = importlib.import_module(grand_parent_name) + parent = getattr(grand_parent, parent_name) + + return getattr(parent, attribute) + + return object_name class MLBlock(): @@ -22,54 +37,59 @@ class MLBlock(): as wrapping them and providing a common interface to run them. Attributes: - name (str): Name given to this MLBlock. - primitive (object): the actual function or instance which this MLBlock - wraps. - fit_args (dict): specification of the arguments expected by the `fit` - method. - fit_method (str): name of the primitive method to call on `fit`. - `None` if the primitive is a function. - produce_args (dict): specification of the arguments expected by the - `predict` method. - produce_output (dict): specification of the outputs of the `produce` - method. - produce_method (str): name of the primitive method to call on - `produce`. `None` if the primitive is a function. + name (str): + Primitive name. + metadata (dict): + Additional information about this primitive + primitive (object): + the actual function or instance which this MLBlock wraps. + fit_args (dict): + specification of the arguments expected by the ``fit`` method. + fit_method (str): + name of the primitive method to call on ``fit``. ``None`` if the + primitive is a function. + produce_args (dict): + specification of the arguments expected by the ``predict`` method. + produce_output (dict): + specification of the outputs of the ``produce`` method. + produce_method (str): + name of the primitive method to call on ``produce``. ``None`` if the primitive is a + function. 
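# Illustrative sketch (not part of the patch): what the new fallback in
# ``import_object`` above allows. Both names are standard numpy attributes used
# only as examples.
from mlblocks.mlblock import import_object

# A plain module attribute resolves on the first try:
argmax = import_object('numpy.argmax')

# 'numpy.random.RandomState.normal' is not importable as a module path, so the
# ImportError branch climbs one level, imports 'numpy.random' and then uses
# getattr twice to reach the method.
normal = import_object('numpy.random.RandomState.normal')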
Args: - name (str): Name given to this MLBlock. - **kwargs: Any additional arguments that will be used as - hyperparameters or passed to the `fit` or `produce` - methods. + primitive (str or dict): + primitive name or primitive dictionary. + **kwargs: + Any additional arguments that will be used as hyperparameters or passed to the + ``fit`` or ``produce`` methods. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found within the `kwargs` or if an unexpected - argument has been given. - """ - # pylint: disable=too-many-instance-attributes + TypeError: + A ``TypeError`` is raised if a required argument is not found within the ``kwargs`` + or if an unexpected argument has been given. + """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. - The `init_params`, `fit_params` and `produce_params` are extracted - from the passed `kwargs` taking the metadata hyperparameters as a + The ``init_params``, ``fit_params`` and ``produce_params`` are extracted + from the passed ``kwargs`` taking the metadata hyperparameters as a reference. During this extraction, make sure that all the required hyperparameters have been given and that nothing unexpected exists in the input. Args: - kwargs (dict): dict containing the Keyword arguments that have - been passed to the `__init__` method upon - initialization. - hyperparameters (dict): hyperparameters dictionary, as found in - the JSON annotation. + kwargs (dict): + dict containing the Keyword arguments that have been passed to the ``__init__`` + method upon initialization. + hyperparameters (dict): + hyperparameters dictionary, as found in the JSON annotation. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found in the `kwargs` dict, or if an unexpected - argument has been given. + TypeError: + A ``TypeError`` is raised if a required argument is not found in the + ``kwargs`` dict, or if an unexpected argument has been given. 
""" init_params = dict() fit_params = dict() @@ -83,7 +103,7 @@ def _extract_params(self, kwargs, hyperparameters): value = param['default'] else: - raise TypeError("Required argument '{}' not found".format(name)) + raise TypeError("{} required argument '{}' not found".format(self.name, name)) init_params[name] = value @@ -91,8 +111,15 @@ def _extract_params(self, kwargs, hyperparameters): if name in kwargs: init_params[name] = kwargs.pop(name) - fit_args = [arg['name'] for arg in self.fit_args] - produce_args = [arg['name'] for arg in self.produce_args] + if not isinstance(self.fit_args, str): + fit_args = [arg['name'] for arg in self.fit_args] + else: + fit_args = [] + + if not isinstance(self.produce_args, str): + produce_args = [arg['name'] for arg in self.produce_args] + else: + produce_args = [] for name in list(kwargs.keys()): if name in fit_args: @@ -107,39 +134,61 @@ def _extract_params(self, kwargs, hyperparameters): return init_params, fit_params, produce_params - def __init__(self, name, **kwargs): + @staticmethod + def _filter_conditional(conditional, init_params): + condition = conditional['condition'] + default = conditional.get('default') + + if condition not in init_params: + return default - self.name = name + condition_value = init_params[condition] + values = conditional['values'] + return values.get(condition_value, default) - metadata = load_primitive(name) + @classmethod + def _get_tunable(cls, hyperparameters, init_params): + tunable = dict() + for name, param in hyperparameters.get('tunable', dict()).items(): + if name not in init_params: + if param['type'] == 'conditional': + param = cls._filter_conditional(param, init_params) + if param is not None: + tunable[name] = param - self.primitive = import_object(metadata['primitive']) + else: + tunable[name] = param - self._fit = metadata.get('fit', dict()) + return tunable + + def __init__(self, primitive, **kwargs): + if isinstance(primitive, str): + primitive = load_primitive(primitive) + + self.metadata = primitive + self.name = primitive['name'] + + self.primitive = import_object(self.metadata['primitive']) + + self._fit = self.metadata.get('fit', dict()) self.fit_args = self._fit.get('args', []) self.fit_method = self._fit.get('method') - self._produce = metadata['produce'] + self._produce = self.metadata['produce'] self.produce_args = self._produce['args'] self.produce_output = self._produce['output'] self.produce_method = self._produce.get('method') self._class = bool(self.produce_method) - hyperparameters = metadata.get('hyperparameters', dict()) + hyperparameters = self.metadata.get('hyperparameters', dict()) init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters) self._hyperparameters = init_params self._fit_params = fit_params self._produce_params = produce_params - tunable = hyperparameters.get('tunable', dict()) - self._tunable = { - name: param - for name, param in tunable.items() - if name not in init_params - # TODO: filter conditionals - } + self._tunable = self._get_tunable(hyperparameters, init_params) default = { name: param['default'] @@ -150,6 +199,7 @@ def __init__(self, name, **kwargs): self.set_hyperparameters(default) def __str__(self): + """Return a string that represents this block.""" return 'MLBlock - {}'.format(self.name) def get_tunable_hyperparameters(self): @@ -165,7 +215,7 @@ def get_tunable_hyperparameters(self): tuned, their types and, if applicable, the accepted ranges or values. 
""" - return self._tunable.copy() + return deepcopy(self._tunable) def get_hyperparameters(self): """Get hyperparameters values that the current MLBlock is using. @@ -175,7 +225,7 @@ def get_hyperparameters(self): the dictionary containing the hyperparameter values that the MLBlock is currently using. """ - return self._hyperparameters.copy() + return deepcopy(self._hyperparameters) def set_hyperparameters(self, hyperparameters): """Set new hyperparameters. @@ -186,19 +236,57 @@ def set_hyperparameters(self, hyperparameters): If necessary, a new instance of the primitive is created. Args: - hyperparameters (dict): Dictionary containing as keys the name - of the hyperparameters and as values - the values to be used. + hyperparameters (dict): + Dictionary containing as keys the name of the hyperparameters and as + values the values to be used. """ self._hyperparameters.update(hyperparameters) if self._class: - self.instance = self.primitive(**self._hyperparameters) + LOGGER.debug('Creating a new primitive instance for %s', self.name) + self.instance = self.primitive(**self.get_hyperparameters()) + + def _get_method_kwargs(self, kwargs, method_args): + """Prepare the kwargs for the method. + + The kwargs dict will be altered according to the method_kwargs + specification to make them ready for the primitive method to + accept them. + + Args: + kwargs (dict): + keyword arguments that have been passed to the block method. + method_args (list): + method arguments as specified in the JSON annotation. + + Returns: + dict: + A dictionary containing the argument names and values to pass + to the primitive method. + """ + if isinstance(method_args, str): + method_args = getattr(self.instance, method_args)() + + method_kwargs = dict() + for arg in method_args: + name = arg['name'] + keyword = arg.get('keyword', name) + + if name in kwargs: + value = kwargs[name] + elif 'default' in arg: + value = arg['default'] + elif arg.get('required', True): + raise TypeError("missing expected argument '{}'".format(name)) + + method_kwargs[keyword] = value + + return method_kwargs def fit(self, **kwargs): """Call the fit method of the primitive. - The given keyword arguments will be passed directly to the `fit` + The given keyword arguments will be passed directly to the ``fit`` method of the primitive instance specified in the JSON annotation. If any of the arguments expected by the produce method had been @@ -208,23 +296,25 @@ def fit(self, **kwargs): the primitive is a simple function, this will be a noop. Args: - **kwargs: Any given keyword argument will be directly passed - to the primitive fit method. + **kwargs: + Any given keyword argument will be directly passed to the primitive fit method. Raises: - TypeError: A `TypeError` might be raised if any argument not - expected by the primitive fit method is given. + TypeError: + A ``TypeError`` might be raised if any argument not expected by the primitive fit + method is given. """ if self.fit_method is not None: - fit_args = self._fit_params.copy() - fit_args.update(kwargs) - getattr(self.instance, self.fit_method)(**fit_args) + fit_kwargs = self._fit_params.copy() + fit_kwargs.update(kwargs) + fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args) + getattr(self.instance, self.fit_method)(**fit_kwargs) def produce(self, **kwargs): """Call the primitive function, or the predict method of the primitive. 
The given keyword arguments will be passed directly to the primitive, - if it is a simple function, or to the `produce` method of the + if it is a simple function, or to the ``produce`` method of the primitive instance specified in the JSON annotation, if it is a class. If any of the arguments expected by the fit method had been given @@ -234,10 +324,11 @@ def produce(self, **kwargs): The output of the call to the primitive function or primitive produce method. """ - produce_args = self._produce_params.copy() - produce_args.update(kwargs) + produce_kwargs = self._produce_params.copy() + produce_kwargs.update(kwargs) + produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args) if self._class: - return getattr(self.instance, self.produce_method)(**produce_args) + return getattr(self.instance, self.produce_method)(**produce_kwargs) - produce_args.update(self._hyperparameters) - return self.primitive(**produce_args) + produce_kwargs.update(self.get_hyperparameters()) + return self.primitive(**produce_kwargs) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 4bad5d1f..738b13b0 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -4,8 +4,18 @@ import json import logging -from collections import Counter, OrderedDict - +import os +import re +import warnings +from collections import Counter, OrderedDict, defaultdict +from copy import deepcopy +from datetime import datetime + +import numpy as np +import psutil +from graphviz import Digraph + +from mlblocks.discovery import load_pipeline from mlblocks.mlblock import MLBlock LOGGER = logging.getLogger(__name__) @@ -34,140 +44,569 @@ class MLPipeline(): results, which will be returned as the prediction of the pipeline. Attributes: - primitives (list): List of the names of the primitives that compose - this pipeline. - blocks (list): OrderedDict of the block names and the corresponding - MLBlock instances. - init_params (dict): init_params dictionary, as given when the instance - was created. - input_names (dict): input_names dictionary, as given when the instance - was created. - output_names (dict): output_names dictionary, as given when the instance - was created. + primitives (list): + List of the names of the primitives that compose this pipeline. + blocks (list): + OrderedDict of the block names and the corresponding MLBlock instances. + init_params (dict): + init_params dictionary, as given when the instance was created. + input_names (dict): + input_names dictionary, as given when the instance was created. + output_names (dict): + output_names dictionary, as given when the instance was created. Args: - primitives (list): List with the names of the primitives that will - compose this pipeline. - init_params (dict): dictionary containing initialization arguments to - be passed when creating the MLBlocks instances. - The dictionary keys must be the corresponding - primitive names and the values must be another - dictionary that will be passed as `**kargs` to the - MLBlock instance. - input_names (dict): dictionary that maps input variable names with the - actual names expected by each primitive. This - allows reusing the same input argument for multiple - primitives that name it differently, as well as - passing different values to primitives that expect - arguments named similary. - output_names (dict): dictionary that maps output variable names with - the name these variables will be given when stored - in the context dictionary. 
This allows storing - the output of different primitives in different - variables, even if the primitive output name is - the same one. + pipeline (str, list, dict or MLPipeline): + The pipeline argument accepts four different types with different interpretations: + * `str`: the name of the pipeline to search and load. + * `list`: the primitives list. + * `dict`: a complete pipeline specification. + * `MLPipeline`: another pipeline to be cloned. + primitives (list): + List with the names of the primitives that will compose this pipeline. + init_params (dict): + dictionary containing initialization arguments to be passed when creating the + MLBlocks instances. The dictionary keys must be the corresponding primitive names + and the values must be another dictionary that will be passed as ``**kargs`` to the + MLBlock instance. + input_names (dict): + dictionary that maps input variable names with the actual names expected by each + primitive. This allows reusing the same input argument for multiple primitives that + name it differently, as well as passing different values to primitives that expect + arguments named similary. + output_names (dict): + dictionary that maps output variable names with the name these variables will be + given when stored in the context dictionary. This allows storing the output of + different primitives in different variables, even if the primitive output name is + the same one. + outputs (dict): + dictionary containing lists of output variables associated to a name. + verbose (bool): + whether to log the exceptions that occur when running the pipeline before + raising them or not. """ def _get_tunable_hyperparameters(self): + """Get the tunable hyperperparameters from all the blocks in this pipeline.""" tunable = {} for block_name, block in self.blocks.items(): tunable[block_name] = block.get_tunable_hyperparameters() return tunable - def __init__(self, primitives, init_params=None, input_names=None, output_names=None): - self.primitives = primitives - self.init_params = init_params or dict() - self.blocks = OrderedDict() + def _build_blocks(self): + blocks = OrderedDict() + last_fit_block = None block_names_count = Counter() - for primitive in primitives: + for primitive in self.primitives: + if isinstance(primitive, str): + primitive_name = primitive + else: + primitive_name = primitive['name'] + try: - block_names_count.update([primitive]) - block_count = block_names_count[primitive] - block_name = '{}#{}'.format(primitive, block_count) + block_names_count.update([primitive_name]) + block_count = block_names_count[primitive_name] + block_name = '{}#{}'.format(primitive_name, block_count) block_params = self.init_params.get(block_name, dict()) if not block_params: - block_params = self.init_params.get(primitive, dict()) + block_params = self.init_params.get(primitive_name, dict()) if block_params and block_count > 1: - LOGGER.warning(("Non-numbered init_params are being used " - "for more than one block %s."), primitive) + LOGGER.warning(('Non-numbered init_params are being used ' + 'for more than one block %s.'), primitive_name) block = MLBlock(primitive, **block_params) - self.blocks[block_name] = block + blocks[block_name] = block + + if bool(block._fit): + last_fit_block = block_name except Exception: - LOGGER.exception("Exception caught building MLBlock %s", primitive) + LOGGER.exception('Exception caught building MLBlock %s', primitive) raise - self.input_names = input_names or dict() - self.output_names = output_names or dict() - 
self._tunable_hyperparameters = self._get_tunable_hyperparameters() + return blocks, last_fit_block + + @staticmethod + def _get_pipeline_dict(pipeline, primitives): + if isinstance(pipeline, dict): + return pipeline + + elif isinstance(pipeline, str): + return load_pipeline(pipeline) + + elif isinstance(pipeline, MLPipeline): + return pipeline.to_dict() + + elif isinstance(pipeline, list): + if primitives is not None: + raise ValueError('if `pipeline` is a `list`, `primitives` must be `None`') + + return {'primitives': pipeline} + + elif pipeline is None: + if primitives is None: + raise ValueError('Either `pipeline` or `primitives` must be not `None`.') + + return dict() + + def _get_block_outputs(self, block_name): + """Get the list of output variables for the given block.""" + outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) + for context_name, output in outputs.items(): + output['variable'] = '{}.{}'.format(block_name, context_name) + + return list(outputs.values()) + + def _get_block_variables(self, block_name, variables_attr, names): + """Get dictionary of variable names to the variable for a given block + + Args: + block_name (str): + Name of the block for which to get the specification + variables_attr (str): + Name of the attribute that has the variables list. It can be + `fit_args`, `produce_args` or `produce_output`. + names (dict): + Dictionary used to translate the variable names. + """ + block = self.blocks[block_name] + variables = deepcopy(getattr(block, variables_attr)) + if isinstance(variables, str): + variables = getattr(block.instance, variables)() + + variable_dict = {} + for variable in variables: + name = variable['name'] + context_name = names.get(name, name) + variable_dict[context_name] = variable + + return variable_dict + + def _get_outputs(self, pipeline, outputs): + """Get the output definitions from the pipeline dictionary. + + If the ``"default"`` entry does not exist, it is built using the + outputs from the last block in the pipeline. 
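# Illustrative sketch (not part of the patch): the four kinds of ``pipeline``
# argument resolved by ``_get_pipeline_dict`` above. The primitive names are
# examples; the registered pipeline name is hypothetical.
from mlblocks import MLPipeline

# 1. A list of primitive names:
pipeline = MLPipeline(['sklearn.preprocessing.StandardScaler',
                       'sklearn.ensemble.RandomForestClassifier'])

# 2. A full specification dict, e.g. one produced by ``to_dict``:
same_pipeline = MLPipeline(pipeline.to_dict())

# 3. Another MLPipeline instance, which is effectively cloned:
clone = MLPipeline(pipeline)

# 4. The name of a pipeline annotation found through the discovery module:
# pipeline = MLPipeline('my_pipelines.classification')   # hypothetical name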
+ """ + outputs = outputs or pipeline.get('outputs') + if outputs is None: + outputs = dict() + + if 'default' not in outputs: + outputs['default'] = self._get_block_outputs(self._last_block_name) + + return outputs + + def _get_block_name(self, index): + """Get the name of the block in the ``index`` position.""" + return list(self.blocks.keys())[index] + + def __init__(self, pipeline=None, primitives=None, init_params=None, + input_names=None, output_names=None, outputs=None, verbose=True): + + pipeline = self._get_pipeline_dict(pipeline, primitives) + + self.primitives = primitives or pipeline['primitives'] + self.init_params = init_params or pipeline.get('init_params', dict()) + self.blocks, self._last_fit_block = self._build_blocks() + self._last_block_name = self._get_block_name(-1) + + self.input_names = input_names or pipeline.get('input_names', dict()) + self.output_names = output_names or pipeline.get('output_names', dict()) + + self.outputs = self._get_outputs(pipeline, outputs) + self.verbose = verbose + + tunable = pipeline.get('tunable_hyperparameters') + if tunable is not None: + self._tunable_hyperparameters = tunable + else: + self._tunable_hyperparameters = self._get_tunable_hyperparameters() + + hyperparameters = pipeline.get('hyperparameters') + if hyperparameters: + self.set_hyperparameters(hyperparameters) + + self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') + + def _get_str_output(self, output): + """Get the outputs that correspond to the str specification.""" + if output in self.outputs: + return self.outputs[output] + elif output in self.blocks: + return [{'name': output, 'variable': output}] + # return self._get_block_outputs(output) + elif '.' in output: + block_name, variable_name = output.rsplit('.', 1) + block = self.blocks.get(block_name) + if not block: + raise ValueError('Invalid block name: {}'.format(block_name)) + + for variable in block.produce_output: + if variable['name'] == variable_name: + output_variable = deepcopy(variable) + output_variable['variable'] = output + return [output_variable] + + raise ValueError('Block {} has no output {}'.format(block_name, variable_name)) + + raise ValueError('Invalid Output Specification: {}'.format(output)) + + def get_inputs(self, fit=True): + """Get a relation of all the input variables required by this pipeline. + + The result is a list contains all of the input variables. + Optionally include the fit arguments. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to ``True``. + + Returns: + list: + Dictionary specifying all the input variables. + Each dictionary contains the entry ``name``, as + well as any other metadata that may have been included in the + pipeline inputs specification. 
+ """ + inputs = dict() + for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards + produce_outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) + + for produce_output_name in produce_outputs.keys(): + inputs.pop(produce_output_name, None) + + produce_inputs = self._get_block_variables( + block_name, + 'produce_args', + self.input_names.get(block_name, dict()) + ) + inputs.update(produce_inputs) + + if fit: + fit_inputs = self._get_block_variables( + block_name, + 'fit_args', + self.input_names.get(block_name, dict()) + ) + inputs.update(fit_inputs) + + return inputs + + def get_fit_args(self): + return list(self.get_inputs(fit=True).values()) + + def get_predict_args(self): + return list(self.get_inputs(fit=False).values()) + + def get_outputs(self, outputs='default'): + """Get the list of output variables that correspond to the specified outputs. + + Outputs specification can either be a single string, a single integer, or a + list of strings and integers. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or the name of a block, including the + counter number at the end, or a full variable specification following the format + ``{block-name}.{variable-name}``. + + Alternatively, integers can be passed as indexes of the blocks from which to get + the outputs. + + If output specifications that resolve to multiple output variables are given, + such as the named outputs or block names, all the variables are concatenated + together, in order, in a single variable list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of dictionaries specifying all the output variables. Each + dictionary contains the entries ``name`` and ``variable``, as + well as any other metadata that may have been included in the + pipeline outputs or block produce outputs specification. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + if not isinstance(outputs, (list, tuple)): + outputs = (outputs, ) + + computed = list() + for output in outputs: + if isinstance(output, int): + output = self._get_block_name(output) + + if isinstance(output, str): + computed.extend(self._get_str_output(output)) + else: + raise TypeError('Output Specification can only be str or int') + + return computed - def get_tunable_hyperparameters(self): + def get_output_names(self, outputs='default'): + """Get the names of the outputs that correspond to the given specification. + + The indicated outputs will be resolved and the names of the output variables + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of variable names + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['name'] for output in outputs] + + def get_output_variables(self, outputs='default'): + """Get the list of variable specifications of the given outputs. + + The indicated outputs will be resolved and their variables specifications + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. 
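# Illustrative sketch (not part of the patch): inspecting the variables a
# pipeline expects, as computed by ``get_inputs`` above. The primitive names
# are examples.
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.preprocessing.StandardScaler',
                       'sklearn.ensemble.RandomForestClassifier'])

# Variables that have to be supplied to fit() and to predict(); intermediate
# outputs produced by earlier blocks are excluded automatically.
print([arg['name'] for arg in pipeline.get_fit_args()])       # e.g. ['y', 'X']
print([arg['name'] for arg in pipeline.get_predict_args()])   # e.g. ['X']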
+ + Returns: + list: + List of variable specifications. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['variable'] for output in outputs] + + def _extract_block_name(self, variable_name): + return self._re_block_name.search(variable_name).group(1) + + def _prepare_outputs(self, outputs): + output_variables = self.get_output_variables(outputs) + outputs = output_variables.copy() + output_blocks = { + self._extract_block_name(variable) + for variable in output_variables + } + return output_variables, outputs, output_blocks + + @staticmethod + def _flatten_dict(hyperparameters): + return { + (block, name): value + for block, block_hyperparameters in hyperparameters.items() + for name, value in block_hyperparameters.items() + } + + def get_tunable_hyperparameters(self, flat=False): """Get the tunable hyperparamters of each block. + Args: + flat (bool): If True, return a flattened dictionary where each key + is a two elements tuple containing the name of the block as the first + element and the name of the hyperparameter as the second one. + If False (default), return a dictionary where each key is the name of + a block and each value is a dictionary containing the complete + hyperparameter specification of that block. + Returns: dict: A dictionary containing the block names as keys and the block tunable hyperparameters dictionary as values. """ - return self._tunable_hyperparameters.copy() + tunables = self._tunable_hyperparameters.copy() + if flat: + tunables = self._flatten_dict(tunables) + + return tunables + + @classmethod + def _sanitize_value(cls, value): + """Convert numpy values to their python primitive type equivalent. + + If a value is a dict, recursively sanitize its values. + + Args: + value: + value to sanitize. + + Returns: + sanitized value. + """ + if isinstance(value, dict): + return { + key: cls._sanitize_value(value) + for key, value in value.items() + } + if isinstance(value, np.integer): + return int(value) + elif isinstance(value, np.floating): + return float(value) + elif isinstance(value, np.ndarray): + return value.tolist() + elif isinstance(value, np.bool_): + return bool(value) + elif value == 'None': + return None + + return value + + @classmethod + def _sanitize(cls, hyperparameters): + """Convert tuple hyperparameter keys to nested dicts. + + Also convert numpy types to primary python types. + + The input hyperparameters dict can specify them in two formats: + + One is the native MLBlocks format, where each key is the name of a block and each value + is a dict containing a complete hyperparameter specification for that block:: + + { + 'block_name': { + 'hyperparameter_name': 'hyperparameter_value', + ... + }, + ... + } + + The other one is an alternative format where each key is a two element tuple containing + the name of the block as the first element and the name of the hyperparameter as the + second one:: + + { + ('block_name', 'hyperparameter_name'): 'hyperparameter_value', + ... + } + + + Args: + hyperparaeters (dict): + hyperparameters dict to sanitize. + + Returns: + dict: + Sanitized dict. 
+ """ + params_tree = defaultdict(dict) + for key, value in hyperparameters.items(): + value = cls._sanitize_value(value) + if isinstance(key, tuple): + block, hyperparameter = key + params_tree[block][hyperparameter] = value + else: + params_tree[key] = value + + return params_tree - def get_hyperparameters(self): + def get_hyperparameters(self, flat=False): """Get the current hyperparamters of each block. + Args: + flat (bool): If True, return a flattened dictionary where each key + is a two elements tuple containing the name of the block as the first + element and the name of the hyperparameter as the second one. + If False (default), return a dictionary where each key is the name of + a block and each value is a dictionary containing the complete + hyperparameter specification of that block. + Returns: dict: A dictionary containing the block names as keys and the current block hyperparameters dictionary as values. """ - hyperparameters = {} + hyperparameters = dict() for block_name, block in self.blocks.items(): hyperparameters[block_name] = block.get_hyperparameters() + if flat: + hyperparameters = self._flatten_dict(hyperparameters) + return hyperparameters def set_hyperparameters(self, hyperparameters): """Set new hyperparameter values for some blocks. Args: - hyperparameters (dict): A dictionary containing the block names as - keys and the new hyperparameters dictionary - as values. + hyperparameters (dict): + A dictionary containing the block names as keys and the new hyperparameters + dictionary as values. """ + hyperparameters = self._sanitize(hyperparameters) for block_name, block_hyperparams in hyperparameters.items(): self.blocks[block_name].set_hyperparameters(block_hyperparams) def _get_block_args(self, block_name, block_args, context): + """Get the arguments expected by the block method from the context. + + The arguments will be taken from the context using both the method + arguments specification and the ``input_names`` given when the pipeline + was created. + + Args: + block_name (str): + Name of this block. Used to find the corresponding input_names. + block_args (list): + list of method argument specifications from the primitive. + context (dict): + current context dictionary. + + Returns: + dict: + A dictionary containing the argument names and values to pass + to the method. 
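# Illustrative sketch (not part of the patch): the flat, tuple-keyed
# hyperparameter format handled by ``_flatten_dict`` and ``_sanitize`` above,
# which is the format hyperparameter tuners typically exchange. Names are
# examples.
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# flat=True -> {(block_name, hyperparameter_name): specification, ...}
flat_tunables = pipeline.get_tunable_hyperparameters(flat=True)
print(list(flat_tunables)[:3])

# The same tuple keys are accepted back; _sanitize regroups them per block and
# converts numpy scalars to plain Python types before they reach the blocks.
pipeline.set_hyperparameters({
    ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 150,
})
print(pipeline.get_hyperparameters(flat=True))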
+ """ # TODO: type validation and/or transformation should be done here input_names = self.input_names.get(block_name, dict()) + if isinstance(block_args, str): + block = self.blocks[block_name] + block_args = getattr(block.instance, block_args)() + kwargs = dict() for arg in block_args: name = arg['name'] - keyword = arg.get('keyword', name) variable = input_names.get(name, name) if variable in context: - value = context[variable] - - elif 'default' in arg: - value = arg['default'] - - else: - raise TypeError( - "Expected argument '{}.{}' not found in context" - .format(block_name, variable) - ) - - kwargs[keyword] = value + kwargs[name] = context[variable] return kwargs - def _get_outputs(self, block_name, outputs, block_outputs): + def _extract_outputs(self, block_name, outputs, block_outputs): + """Extract the outputs of the method as a dict to be set into the context.""" # TODO: type validation and/or transformation should be done here + if isinstance(block_outputs, str): + block = self.blocks[block_name] + block_outputs = getattr(block.instance, block_outputs)() if not isinstance(outputs, tuple): outputs = (outputs, ) @@ -188,115 +627,346 @@ def _get_outputs(self, block_name, outputs, block_outputs): return output_dict - def fit(self, X=None, y=None, **kwargs): + def _update_outputs(self, variable_name, output_variables, outputs, value): + """Set the requested block outputs into the outputs list in the right place.""" + if variable_name in output_variables: + index = output_variables.index(variable_name) + outputs[index] = deepcopy(value) + + def _fit_block(self, block, block_name, context, debug_info=None): + """Get the block args from the context and fit the block.""" + LOGGER.debug('Fitting block %s', block_name) + try: + fit_args = self._get_block_args(block_name, block.fit_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss + start = datetime.utcnow() + block.fit(**fit_args) + elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss + + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(fit_args) + + debug_info['fit'][block_name] = record + + except Exception: + if self.verbose: + LOGGER.exception('Exception caught fitting MLBlock %s', block_name) + + raise + + def _produce_block(self, block, block_name, context, output_variables, + outputs, debug_info=None): + """Get the block args from the context and produce the block. + + Afterwards, set the block outputs back into the context and update + the outputs list if necessary. 
+ """ + LOGGER.debug('Producing block %s', block_name) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss + start = datetime.utcnow() + block_outputs = block.produce(**produce_args) + elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss + + outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) + context.update(outputs_dict) + + if output_variables: + if block_name in output_variables: + self._update_outputs(block_name, output_variables, outputs, context) + else: + for key, value in outputs_dict.items(): + variable_name = '{}.{}'.format(block_name, key) + self._update_outputs(variable_name, output_variables, outputs, value) + + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(produce_args) + if 'o' in debug: + record['output'] = deepcopy(outputs_dict) + + debug_info['produce'][block_name] = record + + except Exception: + if self.verbose: + LOGGER.exception('Exception caught producing MLBlock %s', block_name) + + raise + + def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): """Fit the blocks of this pipeline. - Sequentially call the `fit` and the `produce` methods of each block, - capturing the outputs each `produce` method before calling the `fit` + Sequentially call the ``fit`` and the ``produce`` methods of each block, + capturing the outputs each ``produce`` method before calling the ``fit`` method of the next one. During the whole process a context dictionary is built, where both the - passed arguments and the captured outputs of the `produce` methods - are stored, and from which the arguments for the next `fit` and - `produce` calls will be taken. + passed arguments and the captured outputs of the ``produce`` methods + are stored, and from which the arguments for the next ``fit`` and + ``produce`` calls will be taken. Args: - X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to - behave. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. - """ - context = { - 'X': X, - 'y': y - } - context.update(kwargs) + X: + Fit Data, which the pipeline will learn from. + y: + Fit Data labels, which the pipeline will use to learn how to + behave. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If ``None`` is given, + nothing will be returned. + start_ (str or int or None): + Block index or block name to start processing from. The + value can either be an integer, which will be interpreted as a block index, + or the name of a block, including the conter number at the end. + If given, the execution of the pipeline will start on the specified block, + and all the blocks before that one will be skipped. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. 
+ + If provided, return a dictionary with the ``fit`` and ``predict`` performance. + This argument can be a string containing a combination of the letters listed above, + or ``True`` which will return a complete debug. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. + + Returns: + None or dict or object: + * If no ``output`` is specified, nothing will be returned. + * If ``output_`` has been specified, either a single value or a + tuple of values will be returned. + """ + context = kwargs.copy() + if X is not None: + context['X'] = X - last_block_name = list(self.blocks.keys())[-1] + if y is not None: + context['y'] = y + + if output_ is None: + output_variables = None + outputs = None + output_blocks = set() + else: + output_variables, outputs, output_blocks = self._prepare_outputs(output_) + + if isinstance(start_, int): + start_ = self._get_block_name(start_) + + debug_info = None + if debug: + debug_info = defaultdict(dict) + debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' + + fit_pending = True for block_name, block in self.blocks.items(): - fit_args = self._get_block_args(block_name, block.fit_args, context) + if block_name == self._last_fit_block: + fit_pending = False - LOGGER.debug("Fitting block %s", block_name) - block.fit(**fit_args) + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug('Skipping block %s fit', block_name) + continue + + self._fit_block(block, block_name, context, debug_info) + + if fit_pending or output_blocks: + self._produce_block( + block, block_name, context, output_variables, outputs, debug_info) - if block_name != last_block_name: - produce_args = self._get_block_args(block_name, block.produce_args, context) + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + # If there was an output_ but there are no pending + # outputs we are done. + if output_variables: + if not output_blocks: + if len(outputs) > 1: + result = tuple(outputs) + else: + result = outputs[0] - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if debug: + return result, debug_info - def predict(self, X=None, **kwargs): + return result + + elif not fit_pending: + if debug: + return debug_info + + return + + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) + + if debug: + return debug_info + + def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs): """Produce predictions using the blocks of this pipeline. - Sequentially call the `produce` method of each block, capturing the + Sequentially call the ``produce`` method of each block, capturing the outputs before calling the next one. During the whole process a context dictionary is built, where both the - passed arguments and the captured outputs of the `produce` methods - are stored, and from which the arguments for the next `produce` calls + passed arguments and the captured outputs of the ``produce`` methods + are stored, and from which the arguments for the next ``produce`` calls will be taken. Args: - X: Data which the pipeline will use to make predictions. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. 
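# Illustrative sketch (not part of the patch): the new fit() controls documented
# above. The primitive names, the random data and the intermediate variable
# name are examples.
import numpy as np

from mlblocks import MLPipeline

X_train = np.random.rand(100, 4)
y_train = np.random.randint(0, 2, 100)

pipeline = MLPipeline(['sklearn.preprocessing.StandardScaler',
                       'sklearn.ensemble.RandomForestClassifier'])

# Plain fit: nothing is returned unless an output_ is requested.
pipeline.fit(X_train, y_train)

# Collect per-block time and memory usage while fitting.
debug_info = pipeline.fit(X_train, y_train, debug='tm')
print(debug_info['fit'])

# Request an intermediate output; fit returns as soon as it has been captured.
X_scaled = pipeline.fit(X_train, y_train,
                        output_='sklearn.preprocessing.StandardScaler#1.X')

# Resume fitting from the second block, reusing the already scaled features.
pipeline.fit(X=X_scaled, y=y_train, start_=1)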
+ X: + Data which the pipeline will use to make predictions. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If not specified + the ``default`` output will be returned. + start_ (str or int or None): + Block index or block name to start processing from. The + value can either be an integer, which will be interpreted as a block index, + or the name of a block, including the conter number at the end. + If given, the execution of the pipeline will start on the specified block, + and all the blocks before that one will be skipped. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. + + If ``True`` then a dictionary will be returned containing all the elements listed + previously. If a ``string`` value with the combination of letters is given for + each option, it will return a dictionary with the selected elements. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. + + Returns: + object or tuple: + * If a single output is requested, it is returned alone. + * If multiple outputs have been requested, a tuple is returned. + * If ``debug`` is given, a tupple will be returned where the first element + returned are the predictions and the second a dictionary containing the debug + information. """ - context = { - 'X': X - } - context.update(kwargs) + context = kwargs.copy() + if X is not None: + context['X'] = X + + output_variables, outputs, output_blocks = self._prepare_outputs(output_) + + if isinstance(start_, int): + start_ = self._get_block_name(start_) + + debug_info = None + if debug: + debug_info = defaultdict(dict) + debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' - last_block_name = list(self.blocks.keys())[-1] for block_name, block in self.blocks.items(): - produce_args = self._get_block_args(block_name, block.produce_args, context) + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug('Skipping block %s produce', block_name) + continue - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + self._produce_block(block, block_name, context, output_variables, outputs, debug_info) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - return outputs + # If there was an output_ but there are no pending + # outputs we are done. + if not output_blocks: + if len(outputs) > 1: + result = tuple(outputs) + else: + result = outputs[0] + + if debug: + return result, debug_info + + return result + + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) def to_dict(self): """Return all the details of this MLPipeline in a dict. 
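# Illustrative sketch (not part of the patch): predict() with the output
# handling described above, assuming ``pipeline`` has already been fitted as in
# the previous sketch and ``X_test`` is new data of the same shape. The
# intermediate variable name is an example.
predictions = pipeline.predict(X_test)          # the pipeline's 'default' output

# Several output specifications are returned as a tuple, in the given order.
y_pred, X_scaled = pipeline.predict(
    X_test,
    output_=['default', 'sklearn.preprocessing.StandardScaler#1.X'],
)

# With debug, a (predictions, debug_info) tuple is returned instead.
predictions, debug_info = pipeline.predict(X_test, debug='t')
print(debug_info['produce'])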
- The dict structure contains all the `__init__` arguments of the + The dict structure contains all the ``__init__`` arguments of the MLPipeline, as well as the current hyperparameter values and the specification of the tunable_hyperparameters:: { - "primitives": [ - "a_primitive", - "another_primitive" + 'primitives': [ + 'a_primitive', + 'another_primitive' ], - "init_params": { - "a_primitive": { - "an_argument": "a_value" + 'init_params': { + 'a_primitive': { + 'an_argument': 'a_value' } }, - "hyperparameters": { - "a_primitive#1": { - "an_argument": "a_value", - "another_argument": "another_value", + 'hyperparameters': { + 'a_primitive#1': { + 'an_argument': 'a_value', + 'another_argument': 'another_value', }, - "another_primitive#1": { - "yet_another_argument": "yet_another_value" + 'another_primitive#1': { + 'yet_another_argument': 'yet_another_value' } }, - "tunable_hyperparameters": { - "another_primitive#1": { - "yet_another_argument": { - "type": "str", - "default": "a_default_value", - "values": [ - "a_default_value", - "yet_another_value" + 'tunable_hyperparameters': { + 'another_primitive#1': { + 'yet_another_argument': { + 'type': 'str', + 'default': 'a_default_value', + 'values': [ + 'a_default_value', + 'yet_another_value' ] } } @@ -309,16 +979,353 @@ def to_dict(self): 'input_names': self.input_names, 'output_names': self.output_names, 'hyperparameters': self.get_hyperparameters(), - 'tunable_hyperparameters': self._tunable_hyperparameters + 'tunable_hyperparameters': self._tunable_hyperparameters, + 'outputs': self.outputs, } + def _get_simple_block_name(self, block_name): + """ + Gets the most readable, simplest version of the block name, + without the number of the block or excess modifiers. + + Args: + block_name (str): + Name of the block whose simple name is being extracted. + + Returns: + str: + block name stripped of number and other modifiers. + """ + full_name = block_name.split('#')[0] + simple_name = full_name.split('.')[-1] + return simple_name + + def _get_context_name_from_variable(self, variable_name): + """ + Gets the name of the context from the given variable. + + Args: + variable_name (str): + Name of the variable. + + Returns: + str: + Name of the context of the variable. 
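A small illustration of the naming convention these private helpers rely on (the primitive name is a placeholder and assumes its annotation is available):

```python
from mlblocks import MLPipeline

# An MLPipeline instance is only needed to reach the helpers.
pipeline = MLPipeline(['sklearn.preprocessing.StandardScaler'])

# Block names are '<primitive path>#<counter>'; fully qualified variable names
# append '.<context name>' after the counter.
pipeline._get_simple_block_name('sklearn.preprocessing.StandardScaler#1')
# -> 'StandardScaler'

pipeline._get_context_name_from_variable('sklearn.preprocessing.StandardScaler#1.X')
# -> 'X'
```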
+ """ + block_name = variable_name.split('#')[0] + rest = variable_name[len(block_name) + 1:] + block_index = rest.split('.')[0] + context_name = rest[len(block_index) + 1:] + if len(context_name) == 0: + raise ValueError('Invalid variable name') + return context_name + + def _get_relevant_output_variables(self, block_name, block, current_output_variables): + """ + Gets the output variables of the given block that are in a given set of output variables + + Args: + block_name (str): + The name of the block from which the variables are outputted + + block (MLBlock): + The block from which the variables are outputted + + current_output_variables (list): + A list of possible output variables to return + + Returns: + set: + A set of strings containing the output variable name if and only if it is an + output variable of the given block and its name is in the list of possible + output variables + """ + output_alt_names = self.output_names.get(block_name, dict()) + relevant_output = set() + for block_output in block.produce_output: + output_variable_name = block_output['name'] + if output_variable_name in output_alt_names.keys(): + output_variable_name = output_alt_names[output_variable_name] + + if output_variable_name in current_output_variables: + relevant_output.add(block_output['name']) + + return relevant_output + + def _make_diagram_block(self, diagram, block_name): + """ + Modifies the diagram to add the corresponding block of the pipeline as a visible node in + the diagram. + + Args: + diagram (Digraph): + Diagram to be modified. + + block_name (str): + Name of block to be added to the diagram + """ + simple_name = self._get_simple_block_name(block_name) + diagram.node(block_name, simple_name, penwidth='1') + + def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, variable_blocks): + """ + Modifies the diagram to add the corresponding input variables to the corresponding block + and their edges as outputs to other blocks by modifying `variable_blocks`. Additionally + modifies a set of edges to add any edges between an alternative input name and this block. + + Args: + diagram (Digraph): + Diagram to be modified. + + fit (bool): + `True` if including fitted arguments, `False` otherwise. 
+ + block_name (str): + Name of block whose input variables are to be added to the diagram + + block (MLBlock): + Block whose input variables are to be added to the diagram + + cluster_edges (set): + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead + + variable_blocks (dict): + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use + """ + input_alt_names = self.input_names.get(block_name, dict()) + input_variables = set(variable['name'] for variable in block.produce_args) + + if fit: + for input_variable in block.fit_args: + if input_variable['name'] not in input_variables: + input_variables.add(input_variable['name']) + + for input_name in input_variables: + input_block = block_name + arrowhead = 'normal' + if input_name in input_alt_names: + input_variable_label = block_name + ' ' + input_name + ' (input)' + diagram.node(input_variable_label, + '(' + input_name + ')', fontcolor='blue') + cluster_edges.add((input_variable_label, block_name, 'normal')) + input_name = input_alt_names[input_name] + input_block = input_variable_label + arrowhead = 'none' + + if input_name in variable_blocks.keys(): + variable_blocks[input_name].add((input_block, arrowhead)) + else: + variable_blocks[input_name] = {(input_block, arrowhead)} + + def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, + variable_blocks): + """ + Modifies the diagram to add the corresponding output variables to the corresponding block + and their edges as inputs to other blocks, as well as updating `variable_blocks`. + Additionally modifies a set of edges to add any edges between an alternative output name + and this block. + + Args: + diagram (Digraph): + Diagram to be modified. + + block_name (str): + Name of block whose output variables are to be added to the diagram + + output_names (set): + Set of output variable names to be added to the diagram + + cluster_edges (set): + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead + + variable_blocks (dict): + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use + """ + output_alt_names = self.output_names.get(block_name, dict()) + for output_name in output_names: + output_block = block_name + if output_name in output_alt_names.keys(): + alt_variable_label = block_name + ' ' + output_name + ' (output)' + diagram.node(alt_variable_label, + '(' + output_name + ')', fontcolor='red') + cluster_edges.add((block_name, alt_variable_label, 'none')) + output_name = output_alt_names[output_name] + output_block = alt_variable_label + + output_variable_label = block_name + ' ' + output_name + diagram.node(output_variable_label, output_name) + diagram.edge(output_block, output_variable_label, arrowhead='none') + + for block, arrow in variable_blocks[output_name]: + diagram.edge(output_variable_label, block, arrowhead=arrow) + + del variable_blocks[output_name] + + def _make_diagram_inputs(self, diagram, input_variables_blocks): + """ + Modifies the diagram to add the inputs of the pipeline + + Args: + diagram (Digraph): + Diagram to be modified. 
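Purely illustrative shapes of the bookkeeping structures these diagram helpers pass around, derived from the docstrings above (node names are placeholders):

```python
# Variable name -> set of (consuming node, arrowhead style) tuples.
variable_blocks = {
    'output_variable': {('output_variable_output', 'normal')},  # seeded for pipeline outputs
    'X': {('a_primitive#1', 'normal')},                         # added for block inputs
}

# (tail node, head node, arrowhead style) edges linking alternative variable
# names to their block.
cluster_edges = {
    ('a_primitive#1 X (input)', 'a_primitive#1', 'normal'),
    ('a_primitive#1', 'a_primitive#1 y (output)', 'none'),
}
```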
+ + input_variables_blocks (dict): + Dictionary of input variables of the pipeline and the set of tuples of blocks into + which the variable connects and the type of arrowhead to use + """ + with diagram.subgraph(name='cluster_inputs') as cluster: + cluster.attr(tooltip='Input variables') + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='20') + cluster.attr('edge', penwidth='0', arrowhead='none') + cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables') + input_variables = [] + for input_name, blocks in input_variables_blocks.items(): + input_name_label = input_name + '_input' + cluster.node(input_name_label, input_name) + cluster.edge('Input', input_name_label) + input_variables.append(input_name_label) + + for block, arrow in blocks: + diagram.edge(input_name_label, block, pendwith='1', arrowhead=arrow) + + with cluster.subgraph() as input_variables_subgraph: + input_variables_subgraph.attr(None, rank='same') + for index in range(1, len(input_variables)): + input_variables_subgraph.edge(input_variables[index - 1], + input_variables[index]) + input_variables_subgraph.attr(None, rankdir='LR') + + def _make_diagram_outputs(self, diagram, outputs): + """ + Modifies the diagram to add outputs of the pipeline in order from left to right. + + Args: + diagram (Digraph): + Diagram to be modified. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + Returns: + list[str]: + List of the human-readable names of the output variables in order + """ + output_variables = [] + outputs_vars = self.get_outputs(outputs) + + with diagram.subgraph(name='cluster_outputs') as cluster: + cluster.attr(tooltip='Output variables') + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='20') + cluster.attr('edge', penwidth='0', arrowhead='none') + cluster.node('Output', 'Output', fontsize='14', tooltip='Output variables') + for output in outputs_vars: + try: + variable_name = self._get_context_name_from_variable(output['variable']) + except ValueError: + raise NotImplementedError( + 'Can not deal with this type of output specification') + cluster.node(variable_name + '_output', variable_name) + output_variables.append(variable_name) + cluster.edge(output_variables[-1] + '_output', 'Output') + with cluster.subgraph() as output_variables_subgraph: + output_variables_subgraph.attr(None, rank='same') + for index in range(1, len(output_variables)): + output_variables_subgraph.edge(output_variables[index - 1] + '_output', + output_variables[index] + '_output') + output_variables_subgraph.attr(None, rankdir='LR') + + return output_variables + + def _make_diagram_alignment(self, diagram, cluster_edges): + """ + Modifies the diagram to add alignment edges and connect alternative names to the blocks. 
+ + Args: + diagram (Digraph): + Diagram to be modified + + cluster_edges (set): + Set of tuples that contain alternative variable names and its + corresponding block in order + """ + with diagram.subgraph() as alignment: + alignment.attr('graph', penwidth='0') + alignment.attr('node', penwidth='0') + alignment.attr('edge', len='1', minlen='1', penwidth='1') + + for first_block, second_block, arrow in cluster_edges: + with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: + cluster.edge(first_block, second_block, arrowhead=arrow) + + def get_diagram(self, fit=True, outputs='default', image_path=None): + """ + Creates a png diagram for the pipeline, showing Pipeline Steps, + Pipeline Inputs and Outputs, and block inputs and outputs. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or a full variable specification + following the format ``{block-name}.{variable-name}``. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to `True`. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + image_path (str): + Optional argument for the location at which to save the file. + Defaults to `None`, which returns a `graphviz.Digraph` object instead of + saving the file. + + Returns: + None or `graphviz.Digraph` object: + * `graphviz.Digraph` contains the information about the Pipeline Diagram + """ + + diagram = Digraph(format='png') + diagram.attr('graph', splines='ortho') + diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges + diagram.attr('node', shape='box', penwidth='0') + + output_variables = self._make_diagram_outputs(diagram, outputs) + + cluster_edges = set() + variable_blocks = dict((name, {(name + '_output', 'normal')}) for name in output_variables) + for block_name, block in reversed(self.blocks.items()): + relevant_output_names = self._get_relevant_output_variables(block_name, block, + variable_blocks.keys()) + if len(relevant_output_names) == 0: + continue # skip this block + + self._make_diagram_block(diagram, block_name) + self._make_block_outputs(diagram, block_name, relevant_output_names, cluster_edges, + variable_blocks) + self._make_block_inputs(diagram, fit, block_name, block, cluster_edges, + variable_blocks) + + self._make_diagram_inputs(diagram, variable_blocks) + self._make_diagram_alignment(diagram, cluster_edges) + + if image_path: + diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png') + else: + return diagram + def save(self, path): """Save the specification of this MLPipeline in a JSON file. - The content of the JSON file is the dict returned by the `to_dict` method. + The content of the JSON file is the dict returned by the ``to_dict`` method. Args: - path (str): Path to the JSON file to write. + path (str): + Path to the JSON file to write. """ with open(path, 'w') as out_file: json.dump(self.to_dict(), out_file, indent=4) @@ -327,48 +1334,44 @@ def save(self, path): def from_dict(cls, metadata): """Create a new MLPipeline from a dict specification. - The dict structure is the same as the one created by the `to_dict` method. + The dict structure is the same as the one created by the ``to_dict`` method. Args: - metadata (dict): Dictionary containing the pipeline specification. + metadata (dict): + Dictionary containing the pipeline specification. Returns: MLPipeline: A new MLPipeline instance with the details found in the given specification dictionary. 
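A hedged usage sketch for ``get_diagram`` (the two-block pipeline and the output directory are placeholders; rendering to PNG additionally requires the graphviz system package):

```python
from mlblocks import MLPipeline

pipeline = MLPipeline([
    'sklearn.preprocessing.StandardScaler',
    'sklearn.linear_model.LogisticRegression',
])

# Without image_path a graphviz.Digraph is returned; notebooks display it inline
# and its DOT source resembles the fixtures under tests/data/diagrams/.
diagram = pipeline.get_diagram(fit=False, outputs='default')
print(diagram.source)

# With image_path, a Diagram.png file is rendered into the given directory.
pipeline.get_diagram(image_path='docs/images')
```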
""" - hyperparameters = metadata.get('hyperparameters') - tunable = metadata.get('tunable_hyperparameters') - - pipeline = cls( - metadata['primitives'], - metadata.get('init_params'), - metadata.get('input_names'), - metadata.get('output_names'), + warnings.warn( + 'MLPipeline.form_dict(pipeline_dict) is deprecated and will be removed in a ' + 'later release. Please use MLPipeline(dict) instead,', + DeprecationWarning ) - - if hyperparameters: - pipeline.set_hyperparameters(hyperparameters) - - if tunable is not None: - pipeline._tunable_hyperparameters = tunable - - return pipeline + return cls(metadata) @classmethod def load(cls, path): """Create a new MLPipeline from a JSON specification. - The JSON file format is the same as the one created by the `to_dict` method. + The JSON file format is the same as the one created by the ``to_dict`` method. Args: - path (str): Path of the JSON file to load. + path (str): + Path of the JSON file to load. Returns: MLPipeline: A new MLPipeline instance with the specification found in the JSON file. """ + warnings.warn( + 'MLPipeline.load(path) is deprecated and will be removed in a later release. ' + 'Please use MLPipeline(path) instead,', + DeprecationWarning + ) with open(path, 'r') as in_file: metadata = json.load(in_file) diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py deleted file mode 100644 index 337116e7..00000000 --- a/mlblocks/primitives.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Primitives module. - -This module contains functions to load primitive annotations, -as well as to configure how MLBlocks finds the primitives. -""" - -import json -import os -import sys - -_PRIMITIVES_PATHS = [ - os.path.join(os.getcwd(), 'mlblocks_primitives'), - os.path.join(sys.prefix, 'mlblocks_primitives'), -] - - -def add_primitives_path(path): - """Add a new path to look for primitives. - - The new path will be inserted in the first place of the list, - so any primitive found in this new folder will take precedence - over any other primitive with the same name that existed in the - system before. - - Args: - path (str): path to add - - Raises: - ValueError: A `ValueError` will be raised if the path is not valid. - """ - if path not in _PRIMITIVES_PATHS: - if not os.path.isdir(path): - raise ValueError('Invalid path: {}'.format(path)) - - _PRIMITIVES_PATHS.insert(0, os.path.abspath(path)) - - -def get_primitives_paths(): - """Get the list of folders where the primitives will be looked for. - - Returns: - list: - The list of folders. - """ - return _PRIMITIVES_PATHS - - -def load_primitive(name): - """Locate and load the JSON annotation of the given primitive. - - All the paths found in PRIMTIVE_PATHS will be scanned to find a JSON file - with the given name, and as soon as a JSON with the given name is found it - is returned. - - Args: - name (str): name of the primitive to look for. The name should - correspond to the primitive, not to the filename, as the - `.json` extension will be added dynamically. - - Returns: - dict: - The content of the JSON annotation file loaded into a dict. - - Raises: - ValueError: A `ValueError` will be raised if the primitive cannot be - found. 
- """ - - for base_path in _PRIMITIVES_PATHS: - json_path = os.path.join(base_path, name + '.json') - if os.path.isfile(json_path): - with open(json_path, 'r') as json_file: - return json.load(json_file) - - raise ValueError("Unknown primitive: {}".format(name)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..d2ce3888 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +# Requirements for development and mybinder environment +-e .[dev] +docutils<0.16,>=0.10 # Fix dependency conflict on mybinder diff --git a/setup.cfg b/setup.cfg index fbc69b07..8908f680 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,18 +1,21 @@ [bumpversion] -current_version = 0.2.5-dev +current_version = 0.6.3.dev0 commit = True tag = True -parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? serialize = - {major}.{minor}.{patch}-{release} + {major}.{minor}.{patch}.{release}{candidate} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release +first_value = dev values = dev release +[bumpversion:part:candidate] + [bumpversion:file:setup.py] search = version='{current_version}' replace = version='{new_version}' @@ -30,12 +33,15 @@ exclude = .tox, .git, __pycache__, .ipynb_checkpoints ignore = # Keep empty to prevent default ignores [isort] -include_trailing_comment = True line_length = 99 lines_between_types = 0 multi_line_output = 4 -not_skip = __init__.py use_parentheses = True +not_skip = __init__.py +skip_glob = *.bak + +[metadata] +description-file = README.md [aliases] test = pytest @@ -46,3 +52,9 @@ collect_ignore = ['setup.py'] [tool:pylint] good-names = X,y +[doc8] +max-line-length = 99 + +[pydocstyle] +add-ignore = D403,D413,D105,D107 + diff --git a/setup.py b/setup.py index 9d4b4cfc..e4ab47c9 100644 --- a/setup.py +++ b/setup.py @@ -5,23 +5,40 @@ from setuptools import find_packages, setup - -with open('README.md') as readme_file: +with open('README.md', encoding='utf-8') as readme_file: readme = readme_file.read() - -with open('HISTORY.md') as history_file: +with open('HISTORY.md', encoding='utf-8') as history_file: history = history_file.read() install_requires = [ - 'mlprimitives>=0.1.3', + 'graphviz>=0.9,<1', + 'numpy>=1.17.1,<3', + 'psutil>=5,<7', +] + + +mlprimitives_requires = [ + 'mlprimitives>=0.4.0,<0.5', + 'h5py<4,>=2.10.0', # <- tensorflow 2.3.2 conflict + 'matplotlib<4,>=2.2.2', # <- copulas 0.3.3 + 'protobuf<4', # <- importlib +] + +examples_require = mlprimitives_requires + [ + 'jupyter==1.0.0', + 'baytune>=0.5.0,<0.6', + 'copulas<0.12', ] tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', + 'setuptools>=41.0.0', + 'rundoc>=0.4.3', + 'prompt-toolkit>=2.0,<3.0', ] @@ -32,34 +49,48 @@ development_requires = [ # general - 'bumpversion>=0.5.3', + 'bumpversion>=0.5.3,<0.6', 'pip>=9.0.1', - 'watchdog>=0.8.3', + 'watchdog>=0.8.3,<5', # docs - 'm2r>=0.2.0', - 'Sphinx>=1.7.1', - 'sphinx_rtd_theme>=0.2.4', - 'graphviz==0.9', - 'ipython==6.5.0', - 'matplotlib==2.2.3', - 'recommonmark>=0.4.0', + 'm2r>=0.2.0,<0.3', + 'Sphinx>=1.7.1,<3', + 'sphinx_rtd_theme>=0.2.4,<0.5', + 'docutils>=0.12,<0.18', + 'ipython>=6.5.0', + 'autodocsumm>=0.1.10', + 'Jinja2>=2,<3', # >=3 makes sphinx theme fail + 'markupsafe<2.1.0', + + # fails on Sphinx < v3.4 + 'alabaster<=0.7.12', + # fails on Sphins < v5.0 + 'sphinxcontrib-applehelp<1.0.8', + 'sphinxcontrib-devhelp<1.0.6', + 'sphinxcontrib-htmlhelp<2.0.5', + 'sphinxcontrib-serializinghtml<1.1.10', + 'sphinxcontrib-qthelp<1.0.7', # style check - 'flake8>=3.5.0', - 
'isort>=4.3.4', + 'flake8>=3.7.7,<4', + 'isort>=4.3.4,<5', # fix style issues - 'autoflake>=1.2', # keep this after flake8 to avoid - 'autopep8>=1.3.5', # version incompatibilities with flake8 + 'autoflake>=1.1,<2', + 'autopep8>=1.4.3,<2', # distribute on PyPI - 'twine>=1.10.0', + 'twine>=1.10.0,<4', 'wheel>=0.30.0', # Advanced testing - 'tox>=2.9.1', - 'coverage>=4.5.1', + 'coverage>=4.5.1,<6', + 'tox>=2.9.1,<4', + + # Documentation style + 'doc8>=0.8.0', + 'pydocstyle>=3.0.0', ] @@ -72,26 +103,34 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ], - description="Pipelines and primitives for machine learning and data science.", + description='Pipelines and primitives for machine learning and data science.', extras_require={ - 'dev': development_requires + tests_require, - 'test': tests_require, + 'dev': development_requires + tests_require + examples_require, + 'unit': tests_require, + 'test': tests_require + examples_require, + 'examples': examples_require, + 'mlprimitives': mlprimitives_requires, }, include_package_data=True, install_requires=install_requires, keywords='auto machine learning classification regression data science pipeline', - license="MIT license", + license='MIT license', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), + python_requires='>=3.8,<3.14', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, - url='https://github.com/HDI-Project/MLBlocks', - version='0.2.5-dev', + url='https://github.com/MLBazaar/MLBlocks', + version='0.6.3.dev0', zip_safe=False, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/diagrams/diagram_fit.txt b/tests/data/diagrams/diagram_fit.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_fit.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 
penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_multiple_blocks.txt b/tests/data/diagrams/diagram_multiple_blocks.txt new file mode 100644 index 00000000..3f43a108 --- /dev/null +++ b/tests/data/diagrams/diagram_multiple_blocks.txt @@ -0,0 +1,44 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_b_output [label=output_variable_b] + output_variable_b_output -> Output + { + rank=same + rankdir=LR + } + } + "b_primitive#1" [label=b_primitive penwidth=1] + "b_primitive#1 output_variable_b" [label=output_variable_b] + "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none] + "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal] + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable_a" [label=output_variable_a] + "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none] + "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_simple.txt b/tests/data/diagrams/diagram_simple.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_simple.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/features/test_fit_predicr_args.py b/tests/features/test_fit_predicr_args.py new file mode 100644 index 00000000..af4c0aea --- /dev/null +++ b/tests/features/test_fit_predicr_args.py @@ -0,0 +1,42 @@ +from mlblocks.mlpipeline import MLPipeline + + +def test_fit_predict_args_in_init(): + + def add(a, b): + return a + b + + primitive = { + 'name': 'add', + 'primitive': add, + 'produce': { + 'args': [ + { + 'name': 
'a', + 'type': 'float', + }, + { + 'name': 'b', + 'type': 'float', + }, + ], + 'output': [ + { + 'type': 'float', + 'name': 'out' + } + ] + } + } + + primitives = [primitive] + init_params = { + 'add': { + 'b': 10 + } + } + pipeline = MLPipeline(primitives, init_params=init_params) + + out = pipeline.predict(a=3) + + assert out == 13 diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py new file mode 100644 index 00000000..50739cea --- /dev/null +++ b/tests/features/test_partial_outputs.py @@ -0,0 +1,144 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np + +from mlblocks.mlpipeline import MLPipeline + + +def almost_equal(obj1, obj2): + if isinstance(obj1, dict): + if not isinstance(obj2, dict): + raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) + + for key, value in obj1.items(): + if key not in obj2: + raise AssertionError("{} not in {}".format(key, obj2)) + + almost_equal(value, obj2[key]) + + else: + np.testing.assert_almost_equal(obj1, obj2) + + +class TestPartialOutputs(TestCase): + def setUp(self): + self.X = np.array([ + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ]) + self.y = np.array([0, 0, 0, 0, 1]) + + def test_fit_output(self): + + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + named = 'default' + list_ = ['default', 0] + int_block = 0 + invalid_int = 10 + str_block = 'sklearn.preprocessing.StandardScaler#1' + invalid_block = 'InvalidBlockName' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X' + invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' + + # Run + named_out = pipeline.fit(self.X, self.y, output_=named) + list_out = pipeline.fit(self.X, self.y, output_=list_) + int_out = pipeline.fit(self.X, self.y, output_=int_block) + str_out = pipeline.fit(self.X, self.y, output_=str_block) + str_out_variable = pipeline.fit(self.X, self.y, + output_=str_block_variable) + no_output = pipeline.fit(self.X, self.y) + + # Assert successful calls + X = np.array([ + [2., -0.5, -0.5, -0.5, -0.5], + [-0.5, 2., -0.5, -0.5, -0.5], + [-0.5, -0.5, 2., -0.5, -0.5], + [-0.5, -0.5, -0.5, 2., -0.5], + [-0.5, -0.5, -0.5, -0.5, 2.], + ]) + y = np.array([ + 0, 0, 0, 0, 1 + ]) + context = {'X': X, 'y': y} + + almost_equal(named_out, y) + assert len(list_out) == 2 + almost_equal(list_out[0], y) + almost_equal(list_out[1], context) + almost_equal(context, int_out) + almost_equal(context, str_out) + almost_equal(X, str_out_variable) + assert no_output is None + + # Run asserting exceptions + with self.assertRaises(IndexError): + pipeline.fit(self.X, self.y, output_=invalid_int) + + with self.assertRaises(ValueError): + pipeline.fit(self.X, self.y, output_=invalid_block) + + with self.assertRaises(ValueError): + pipeline.fit(self.X, self.y, output_=invalid_variable) + + def test_fit_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X, + 'y': self.y + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.fit(start_=int_start, **context) + pipeline.fit(start_=str_start, **context) + + # 
Assert that mock has not been called + block_mock.fit.assert_not_called() + + def test_predict_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + pipeline.fit(self.X, self.y) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X, + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.predict(start_=int_start, **context) + pipeline.predict(start_=str_start, **context) + + # Assert that mock has not been called + block_mock.predict.assert_not_called() diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py new file mode 100644 index 00000000..4b363d07 --- /dev/null +++ b/tests/features/test_pipeline_loading.py @@ -0,0 +1,105 @@ +from unittest import TestCase + +from mlblocks import MLPipeline + + +class TestMLPipeline(TestCase): + + def test_dict(self): + pipeline_dict = { + 'primitives': [ + 'sklearn.ensemble.RandomForestClassifier' + ], + 'init_params': { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + }, + 'input_names': { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + }, + 'output_names': { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + } + + pipeline = MLPipeline(pipeline_dict) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + assert pipeline.input_names == { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + } + assert pipeline.output_names == { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + + def test_list(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_none(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_mlpipeline(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + pipeline2 = MLPipeline(pipeline) + + assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline2.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } diff --git a/tests/test_datasets.py b/tests/test_datasets.py deleted file mode 100644 index 174a85d6..00000000 --- a/tests/test_datasets.py +++ /dev/null @@ -1,58 +0,0 @@ -# -*- coding: utf-8 -*- - -from unittest import TestCase -from unittest.mock import Mock - -from mlblocks import datasets - - -class TestDataset(TestCase): - - def setUp(self): - self.description = """Dataset Name. 
- - Some extended description. - """ - self.score = Mock() - self.score.return_value = 1.0 - - self.dataset = datasets.Dataset( - self.description, 'data', 'target', self.score, - shuffle=False, stratify=True, some='kwargs') - - def test___init__(self): - - assert self.dataset.name == 'Dataset Name.' - assert self.dataset.description == self.description - assert self.dataset.data == 'data' - assert self.dataset.target == 'target' - assert self.dataset._shuffle is False - assert self.dataset._stratify is True - assert self.dataset._score == self.score - assert self.dataset.some == 'kwargs' - - def test_score(self): - returned = self.dataset.score('a', b='c') - - assert returned == 1.0 - self.score.assert_called_once_with('a', b='c') - - def test___repr__(self): - repr_ = str(self.dataset) - - assert repr_ == "Dataset Name." - - -def test_dataset_describe(capsys): - """Tested here because fixtures are not supported in TestCases.""" - - description = """Dataset Name. - - Some extended description. - """ - - dataset = datasets.Dataset(description, 'data', 'target', 'score') - dataset.describe() - - captured = capsys.readouterr() - assert captured.out == description + '\n' diff --git a/tests/test_discovery.py b/tests/test_discovery.py new file mode 100644 index 00000000..25e6e444 --- /dev/null +++ b/tests/test_discovery.py @@ -0,0 +1,408 @@ +# -*- coding: utf-8 -*- + +import json +import os +import tempfile +import uuid +from unittest.mock import Mock, call, patch + +import pytest +from pkg_resources import Distribution, EntryPoint + +from mlblocks import discovery + +FAKE_PRIMITIVES_PATH = 'this/is/a/fake' +FAKE_PRIMITIVES_PATHS = [ + 'this/is/another/fake', + 'this/is/yet/another/fake', +] + + +def test__add_lookup_path_do_nothing(): + paths = ['a', 'b'] + discovery._add_lookup_path('a', paths) + + assert paths == ['a', 'b'] + + +def test__add_lookup_path_exception(): + paths = ['a', 'b'] + invalid_path = str(uuid.uuid4()) + + with pytest.raises(ValueError): + discovery._add_lookup_path(invalid_path, paths) + + +def test__add_lookup_path(): + paths = ['a', 'b'] + discovery._add_lookup_path('tests', paths) + + expected_path = os.path.abspath('tests') + + assert paths == [expected_path, 'a', 'b'] + + +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +def test_add_primitives_path(): + discovery.add_primitives_path(os.path.abspath('tests')) + + expected_path = os.path.abspath('tests') + assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] + + +@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +def test_add_pipelines_path(): + discovery.add_pipelines_path('tests') + + expected_path = os.path.abspath('tests') + assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b'] + + +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') +def test__load_entry_points_no_entry_points(iep_mock): + # setup + iep_mock.return_value == [] + + # run + paths = discovery._load_entry_points('jsons_path', 'mlprimitives') + + # assert + assert paths == [] + expected_calls = [ + call('mlprimitives'), + ] + assert iep_mock.call_args_list == expected_calls + + +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') +def test__load_entry_points_entry_points(iep_mock): + # setup + something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') + primitives_ep = EntryPoint( + 'primitives', + 'tests.test_discovery', + attrs=['FAKE_PRIMITIVES_PATH'], + dist=Distribution() + ) + another_primitives_ep = 
EntryPoint( + 'primitives', + 'tests.test_discovery', + attrs=['FAKE_PRIMITIVES_PATHS'], + dist=Distribution() + ) + iep_mock.return_value = [ + something_else_ep, + primitives_ep, + another_primitives_ep + ] + + # run + paths = discovery._load_entry_points('primitives') + + # assert + expected = [ + 'this/is/a/fake', + 'this/is/another/fake', + 'this/is/yet/another/fake', + ] + assert paths == expected + + expected_calls = [ + call('mlblocks'), + ] + assert iep_mock.call_args_list == expected_calls + + +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._load_entry_points') +def test_get_primitives_paths(lep_mock): + lep_mock.side_effect = [['c'], []] + + paths = discovery.get_primitives_paths() + + assert paths == ['a', 'b', 'c'] + expected_calls = [ + call('primitives'), + call('jsons_path', 'mlprimitives'), + ] + assert lep_mock.call_args_list == expected_calls + + +@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._load_entry_points') +def test_get_pipelines_paths(lep_mock): + lep_mock.return_value = ['c'] + + paths = discovery.get_pipelines_paths() + + assert paths == ['a', 'b', 'c'] + lep_mock.assert_called_once_with('pipelines') + + +def test__load_value_error(): + primitive = discovery._load('invalid.primitive', ['a', 'b']) + + assert primitive is None + + +def test__load_success(): + primitive = { + 'name': 'temp.primitive', + 'primitive': 'temp.primitive' + } + + with tempfile.TemporaryDirectory() as tempdir: + paths = [tempdir] + primitive_path = os.path.join(tempdir, 'temp.primitive.json') + with open(primitive_path, 'w') as primitive_file: + json.dump(primitive, primitive_file, indent=4) + + loaded = discovery._load('temp.primitive', paths) + + assert primitive == loaded + + +def test__load_json_path(): + primitive = { + 'name': 'temp.primitive', + 'primitive': 'temp.primitive' + } + + with tempfile.TemporaryDirectory() as tempdir: + paths = [tempdir] + primitive_path = os.path.join(tempdir, 'temp.primitive.json') + with open(primitive_path, 'w') as primitive_file: + json.dump(primitive, primitive_file, indent=4) + + loaded = discovery._load(primitive_path, paths) + + assert primitive == loaded + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + discovery.load_primitive('invalid.primitive') + + load_mock.assert_called_once_with('invalid.primitive', ['a', 'b']) + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + primitive = discovery.load_primitive('valid.primitive') + + load_mock.assert_called_once_with('valid.primitive', ['a', 'b']) + + assert primitive == load_mock.return_value + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + discovery.load_pipeline('invalid.pipeline') + + load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b']) + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + pipeline = 
discovery.load_pipeline('valid.pipeline') + + load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) + + assert pipeline == load_mock.return_value + + +@patch('mlblocks.discovery.os') +def test__search_annotations(os_mock): + os_mock.path.abspath = os.path.abspath + os_mock.path.join = os.path.join + os_mock.path.exists.return_value = True + os_mock.listdir.side_effect = [ + [ + 'a.primitive.json', + 'another.primitive.json', + 'some', + ], + [ + 'other', + ], + [ + 'primitive.json' + ] + ] + os_mock.path.isdir.return_value = False + os_mock.path.isdir.side_effect = [ + False, + False, + True, + True, + False + ] + + annotations = discovery._search_annotations('/path/to', 'other') + + assert annotations == { + '/path/to/another.primitive.json': 'another.primitive', + '/path/to/some/other/primitive.json': 'some.other.primitive' + } + + +def test__match_no_match(): + annotation = { + 'name': 'a.primitive', + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_root(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_sublevel(): + annotation = { + 'name': 'a.primitive', + 'some': { + 'sublevel': { + 'key': 'value' + } + } + } + + matches = discovery._match(annotation, 'some.sublevel.key', 'value') + + assert matches + + +def test__match_list_no_match(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'another_value' + 'yet_another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_list(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'value', + 'another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_dict(): + annotation = { + 'name': 'a.primitive', + 'key': { + 'value': 'subvalue', + 'another_value': 'another_subvalue' + } + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_multiple_keys(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', ['value', 'another_value']) + + assert matches + + +@patch('mlblocks.discovery._search_annotations') +def test__find_annotations(search_annotations_mock): + search_annotations_mock.return_value = { + '/path/to/a/classifier.primitive.json': 'classifier.primitive', + '/path/to/a/regressor.primitive.json': 'regressor.primitive', + } + + loader = Mock() + loader.side_effect = [ + { + 'name': 'classifier.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'classifier', + } + }, + { + 'name': 'regressor.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'regressor', + } + } + ] + + filters = { + 'classifiers.subtype': 'regressor' + } + annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters) + + assert annotations == ['regressor.primitive'] + search_annotations_mock.assert_called_once_with('/a/path', 'pattern') + + +@patch('mlblocks.discovery._find_annotations') +@patch('mlblocks.discovery.get_primitives_paths') +def test_find_primitives(gpp_mock, fa_mock): + primitives = discovery.find_primitives('pattern') + + fa_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) + + assert primitives == fa_mock.return_value + + +@patch('mlblocks.discovery._find_annotations') +@patch('mlblocks.discovery.get_pipelines_paths') +def 
test_find_pipelines(gpp_mock, fa_mock): + primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) + + fa_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) + + assert primitives == fa_mock.return_value diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index abc235b0..93adb0dd 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -1,21 +1,46 @@ # -*- coding: utf-8 -*- from unittest import TestCase -from unittest.mock import patch +from unittest.mock import MagicMock, Mock, patch -from mlblocks.mlblock import MLBlock, import_object +import pytest -# import pytest +from mlblocks.mlblock import MLBlock, import_object class DummyClass: + def a_method(self): + pass + + +def dummy_function(): pass -def test_import_object(): - dummy_class = import_object(__name__ + '.DummyClass') +class TestImportObject(TestCase): + + def test_class(self): + imported = import_object(__name__ + '.DummyClass') + + assert imported is DummyClass - assert dummy_class is DummyClass + def test_class_method(self): + imported = import_object(__name__ + '.DummyClass.a_method') + + assert imported is DummyClass.a_method + + def test_function(self): + imported = import_object(__name__ + '.dummy_function') + + assert imported is dummy_function + + def test_bad_object_name(self): + with pytest.raises(AttributeError): + import_object(__name__ + '.InvalidName') + + def test_bad_module(self): + with pytest.raises(ImportError): + import_object('an.invalid.module') class TestMLBlock(TestCase): @@ -23,11 +48,307 @@ class TestMLBlock(TestCase): def test__extract_params(self): pass + def test__get_tunable_no_conditionals(self): + """If there are no conditionals, tunables are returned unmodified.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + } + } + assert tunable == expected + + def test__get_tunable_no_condition(self): + """If there is a conditional but no condition, the default is used.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, + 'values': { + 'not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + } + } + assert tunable == expected + + def test__get_tunable_condition_match(self): + """If there is a conditional and it matches, only that part is returned.""" + + # setup + init_params = { + 'a_condition': 'a_match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 
'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, + 'values': { + 'not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] + } + } + assert tunable == expected + + def test__get_tunable_condition_no_match(self): + """If there is a conditional and it does not match, the default is used.""" + + # setup + init_params = { + 'a_condition': 'not_a_match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, + 'values': { + 'also_not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + } + } + assert tunable == expected + + def test__get_tunable_condition_default_null(self): + """If there is no match and default is null (None), this param is not included.""" + + # setup + init_params = { + 'a_condition': 'not_a_match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': None, + 'values': { + 'also_not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + } + } + assert tunable == expected + + def test__get_tunable_condition_match_null(self): + """If there is a match and it is null (None), this param is not included. + + This stands even if the default is not null. 
+ """ + + # setup + init_params = { + 'a_condition': 'a_match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, + 'values': { + 'not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'a_match': None + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1, + 'range': [1, 10] + } + } + assert tunable == expected + @patch('mlblocks.mlblock.MLBlock.set_hyperparameters') @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [ @@ -40,9 +361,22 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): } } - mlblock = MLBlock('given_primitive_name', argument='value') + mlblock = MLBlock('a_primitive_name', argument='value') - assert mlblock.name == 'given_primitive_name' + assert mlblock.metadata == { + 'name': 'a_primitive_name', + 'primitive': 'a_primitive_name', + 'produce': { + 'args': [ + { + 'name': 'argument' + } + ], + 'output': [ + ] + } + } + assert mlblock.name == 'a_primitive_name' assert mlblock.primitive == import_object_mock.return_value assert mlblock._fit == dict() assert mlblock.fit_args == list() @@ -75,6 +409,7 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): @patch('mlblocks.mlblock.load_primitive') def test___str__(self, load_primitive_mock, import_object_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -82,15 +417,16 @@ def test___str__(self, load_primitive_mock, import_object_mock): } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive_name') - assert str(mlblock) == 'MLBlock - given_primitive_name' + assert str(mlblock) == 'MLBlock - a_primitive_name' @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mock): """get_tunable_hyperparameters has to return a copy of the _tunables attribute.""" load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -108,27 +444,54 @@ def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mo assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlblock.import_object', new=Mock()) + @patch('mlblocks.mlblock.load_primitive', new=MagicMock()) + def test_get_hyperparameters(self): + """get_hyperparameters has to return a deepcopy of the _hyperparameters attribute.""" + mlblock = MLBlock('given_primitive_name') + + hyperparameters = { + 'a_list_param': ['a'] + } + mlblock._hyperparameters = hyperparameters + + returned = mlblock.get_hyperparameters() + + assert returned == hyperparameters + assert returned is not hyperparameters + + returned['a_list_param'].append('b') + assert 'b' not in hyperparameters['a_list_param'] + @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') - def test_get_hyperparameters(self, 
load_primitive_mock, import_object_mock): - """get_hyperparameters has to return a copy of the _hyperparameters attribute.""" - load_primitive_mock.return_value = { - 'primitive': 'a_primitive_name', + def test_modify_hyperparameters(self, lp_mock, io_mock): + """If a primitive method modifies the hyperparameters, changes should not persist.""" + + def primitive(a_list_param): + a_list_param.append('b') + + io_mock.return_value = primitive + + lp_mock.return_value = { + 'name': 'a_primitive', + 'primitive': 'a_primitive', 'produce': { 'args': [], 'output': [] } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive') - hyperparameters = dict() + hyperparameters = { + 'a_list_param': ['a'] + } mlblock._hyperparameters = hyperparameters - returned = mlblock.get_hyperparameters() + mlblock.produce() - assert returned == hyperparameters - assert returned is not hyperparameters + assert 'b' not in hyperparameters['a_list_param'] def test_set_hyperparameters_function(self): pass diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 2fa6d097..084eac3d 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -2,17 +2,36 @@ from collections import OrderedDict from unittest import TestCase -from unittest.mock import Mock, call, patch +from unittest.mock import MagicMock, call, patch +import pytest + +from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline +def get_mlblock_mock(*args, **kwargs): + return MagicMock(autospec=MLBlock) + + class TestMLPipline(TestCase): @patch('mlblocks.mlpipeline.LOGGER') @patch('mlblocks.mlpipeline.MLBlock') def test___init__(self, mlblock_mock, logger_mock): - blocks = [Mock(), Mock(), Mock(), Mock()] + blocks = [ + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock() + ] + last_block = blocks[-1] + last_block.produce_output = [ + { + 'name': 'y', + 'type': 'array' + } + ] mlblock_mock.side_effect = blocks primitives = [ @@ -39,7 +58,11 @@ def test___init__(self, mlblock_mock, logger_mock): } expected_input_names = input_names.copy() - mlpipeline = MLPipeline(primitives, init_params, input_names) + mlpipeline = MLPipeline( + primitives=primitives, + init_params=init_params, + input_names=input_names + ) assert mlpipeline.primitives == expected_primitives assert mlpipeline.init_params == expected_init_params @@ -57,6 +80,16 @@ def test___init__(self, mlblock_mock, logger_mock): 'another.primitive.Name#1': blocks[2].get_tunable_hyperparameters.return_value, 'another.primitive.Name#2': blocks[3].get_tunable_hyperparameters.return_value } + assert mlpipeline.outputs == { + 'default': [ + { + 'name': 'y', + 'type': 'array', + 'variable': 'another.primitive.Name#2.y' + } + ] + } + assert mlpipeline.verbose expected_calls = [ call('a.primitive.Name', an_argument='value'), @@ -71,8 +104,9 @@ def test___init__(self, mlblock_mock, logger_mock): 'a.primitive.Name' ) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_tunable_hyperparameters(self): - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) tunable = dict() mlpipeline._tunable_hyperparameters = tunable @@ -81,33 +115,131 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_tunable_hyperparameters_flat(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline._tunable_hyperparameters = { + 'block_1': { + 'hp_1': { + 'type': 'int', + 
+                    'range': [
+                        1,
+                        10
+                    ],
+                }
+            },
+            'block_2': {
+                'hp_1': {
+                    'type': 'str',
+                    'default': 'a',
+                    'values': [
+                        'a',
+                        'b',
+                        'c'
+                    ],
+                },
+                'hp_2': {
+                    'type': 'bool',
+                    'default': True,
+                }
+            }
+        }
+
+        returned = mlpipeline.get_tunable_hyperparameters(flat=True)
+
+        expected = {
+            ('block_1', 'hp_1'): {
+                'type': 'int',
+                'range': [
+                    1,
+                    10
+                ],
+            },
+            ('block_2', 'hp_1'): {
+                'type': 'str',
+                'default': 'a',
+                'values': [
+                    'a',
+                    'b',
+                    'c'
+                ],
+            },
+            ('block_2', 'hp_2'): {
+                'type': 'bool',
+                'default': True,
+            }
+        }
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
     def test_get_hyperparameters(self):
-        block_1 = Mock()
-        block_2 = Mock()
+        block_1 = get_mlblock_mock()
+        block_1.get_hyperparameters.return_value = {
+            'a': 'a'
+        }
+        block_2 = get_mlblock_mock()
+        block_2.get_hyperparameters.return_value = {
+            'b': 'b',
+            'c': 'c',
+        }
         blocks = OrderedDict((
             ('a.primitive.Name#1', block_1),
             ('a.primitive.Name#2', block_2),
         ))
-        mlpipeline = MLPipeline(list())
+        mlpipeline = MLPipeline(['a_primitive'])
         mlpipeline.blocks = blocks
 
         hyperparameters = mlpipeline.get_hyperparameters()
 
         assert hyperparameters == {
-            'a.primitive.Name#1': block_1.get_hyperparameters.return_value,
-            'a.primitive.Name#2': block_2.get_hyperparameters.return_value,
+            'a.primitive.Name#1': {
+                'a': 'a',
+            },
+            'a.primitive.Name#2': {
+                'b': 'b',
+                'c': 'c',
+            },
+        }
+        block_1.get_hyperparameters.assert_called_once_with()
+        block_2.get_hyperparameters.assert_called_once_with()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_hyperparameters_flat(self):
+        block_1 = get_mlblock_mock()
+        block_1.get_hyperparameters.return_value = {
+            'a': 'a'
+        }
+        block_2 = get_mlblock_mock()
+        block_2.get_hyperparameters.return_value = {
+            'b': 'b',
+            'c': 'c',
+        }
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+        mlpipeline = MLPipeline(['a_primitive'])
+        mlpipeline.blocks = blocks
+
+        hyperparameters = mlpipeline.get_hyperparameters(flat=True)
+
+        assert hyperparameters == {
+            ('a.primitive.Name#1', 'a'): 'a',
+            ('a.primitive.Name#2', 'b'): 'b',
+            ('a.primitive.Name#2', 'c'): 'c',
         }
         block_1.get_hyperparameters.assert_called_once_with()
         block_2.get_hyperparameters.assert_called_once_with()
 
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
     def test_set_hyperparameters(self):
-        block_1 = Mock()
-        block_2 = Mock()
+        block_1 = get_mlblock_mock()
+        block_2 = get_mlblock_mock()
         blocks = OrderedDict((
             ('a.primitive.Name#1', block_1),
             ('a.primitive.Name#2', block_2),
         ))
-        mlpipeline = MLPipeline(list())
+        mlpipeline = MLPipeline(['a_primitive'])
         mlpipeline.blocks = blocks
 
         hyperparameters = {
@@ -120,11 +252,944 @@ def test_set_hyperparameters(self):
 
         block_1.set_hyperparameters.assert_not_called()
         block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
 
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_set_hyperparameters_flat(self):
+        block_1 = get_mlblock_mock()
+        block_2 = get_mlblock_mock()
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+        mlpipeline = MLPipeline(['a_primitive'])
+        mlpipeline.blocks = blocks
+
+        hyperparameters = {
+            ('a.primitive.Name#2', 'some'): 'arg'
+        }
+        mlpipeline.set_hyperparameters(hyperparameters)
+
+        block_1.set_hyperparameters.assert_not_called()
+        block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_block_args(self):
-        pass
+        input_names = {
+            'a_block': {
+                'arg_3': 'arg_3_alt'
+            }
+        }
+        pipeline = MLPipeline(['a_primitive'], input_names=input_names)
 
-    def test__get_outputs(self):
-        pass
+        block_args = [
+            {
+                'name': 'arg_1',
+            },
+            {
+                'name': 'arg_2',
+                'default': 'arg_2_value'
+            },
+            {
+                'name': 'arg_3',
+            },
+            {
+                'name': 'arg_4',
+                'required': False
+            },
+        ]
+        context = {
+            'arg_1': 'arg_1_value',
+            'arg_3_alt': 'arg_3_value'
+        }
+
+        args = pipeline._get_block_args('a_block', block_args, context)
+
+        expected = {
+            'arg_1': 'arg_1_value',
+            'arg_3': 'arg_3_value',
+        }
+        assert args == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_outputs_no_outputs(self):
+        self_ = MagicMock(autospec=MLPipeline)
+
+        self_._last_block_name = 'last_block'
+        self_._get_block_outputs.return_value = ['some', 'outputs']
+
+        pipeline = dict()
+        outputs = None
+        returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+        expected = {
+            'default': ['some', 'outputs']
+        }
+        assert returned == expected
+
+        self_._get_block_outputs.assert_called_once_with('last_block')
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_outputs_defaults(self):
+        self_ = MagicMock(autospec=MLPipeline)
+
+        pipeline = dict()
+        outputs = {
+            'default': ['some', 'outputs']
+        }
+        returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+        expected = {
+            'default': ['some', 'outputs']
+        }
+        assert returned == expected
+        self_._get_block_outputs.assert_not_called()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_outputs_additional(self):
+        self_ = MagicMock(autospec=MLPipeline)
+
+        pipeline = {
+            'outputs': {
+                'default': ['some', 'outputs'],
+                'additional': ['other', 'outputs']
+            }
+        }
+        outputs = None
+        returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+        expected = {
+            'default': ['some', 'outputs'],
+            'additional': ['other', 'outputs']
+        }
+        assert returned == expected
+        self_._get_block_outputs.assert_not_called()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_str_named(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ],
+            'debug': [
+                {
+                    'name': 'another_name',
+                    'variable': 'another_variable',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
+
+        returned = pipeline.get_outputs('debug')
+
+        expected = [
+            {
+                'name': 'another_name',
+                'variable': 'another_variable',
+            }
+        ]
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_str_variable(self):
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'whatever'
+            }
+        ]
+
+        returned = pipeline.get_outputs('a_primitive#1.output')
+
+        expected = [
+            {
+                'name': 'output',
+                'type': 'whatever',
+                'variable': 'a_primitive#1.output'
+            }
+        ]
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_str_block(self):
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+        returned = pipeline.get_outputs('a_primitive#1')
+
+        expected = [
+            {
+                'name': 'a_primitive#1',
+                'variable': 'a_primitive#1',
+            }
+        ]
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_int(self):
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+        returned = pipeline.get_outputs(-1)
+
+        expected = [
+            {
+                'name': 'another_primitive#1',
+                'variable': 'another_primitive#1',
+            }
+        ]
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_combination(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ],
+            'debug': [
+                {
+                    'name': 'another_name',
+                    'variable': 'another_variable',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['another_primitive#1'].produce_output = [
+            {
+                'name': 'something',
+            }
+        ]
+
+        returned = pipeline.get_outputs(['default', 'debug', -1, 'a_primitive#1.output'])
+
+        expected = [
+            {
+                'name': 'a_name',
+                'variable': 'a_variable',
+                'type': 'a_type'
+            },
+            {
+                'name': 'another_name',
+                'variable': 'another_variable',
+            },
+            {
+                'name': 'another_primitive#1',
+                'variable': 'another_primitive#1',
+            },
+            {
+                'name': 'output',
+                'type': 'whatever',
+                'variable': 'a_primitive#1.output'
+            }
+        ]
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_outputs_invalid(self):
+        pipeline = MLPipeline(['a_primitive'])
+
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'whatever'
+            }
+        ]
+
+        with pytest.raises(ValueError):
+            pipeline.get_outputs('a_primitive#1.invalid')
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_output_names(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_names()
+
+        assert names == ['a_name']
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_output_variables(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_variables()
+
+        assert names == ['a_variable']
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_block_variables_is_dict(self):
+        pipeline = MLPipeline(['a_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_outputs = [
+            {
+                'name': 'output',
+                'type': 'whatever'
+            }
+        ]
+
+        outputs = pipeline._get_block_variables(
+            'a_primitive#1',
+            'produce_outputs',
+            {'output': 'name_output'}
+        )
+
+        expected = {
+            'name_output': {
+                'name': 'output',
+                'type': 'whatever',
+            }
+        }
+        assert outputs == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test__get_block_variables_is_str(self):
+        pipeline = MLPipeline(['a_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_outputs = 'get_produce_outputs'
+        pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.return_value = [
+            {
+                'name': 'output_from_function',
+                'type': 'test'
+            }
+        ]
+
+        outputs = pipeline._get_block_variables(
+            'a_primitive#1',
+            'produce_outputs',
+            {'output': 'name_output'}
+        )
+
+        expected = {
+            'output_from_function': {
+                'name': 'output_from_function',
+                'type': 'test',
+            }
+        }
+        assert outputs == expected
+        pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.assert_called_once_with()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_inputs_fit(self):
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            }
+        ]
+        pipeline.blocks['another_primitive#1'].produce_args = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            },
+            {
+                'name': 'another_input',
+                'type': 'another_whatever'
+            }
+        ]
+
+        inputs = pipeline.get_inputs()
+
+        expected = {
+            'input': {
+                'name': 'input',
+                'type': 'whatever',
+            },
+            'fit_input': {
+                'name': 'fit_input',
+                'type': 'whatever',
+            },
+            'another_input': {
+                'name': 'another_input',
+                'type': 'another_whatever',
+            }
+        }
+        assert inputs == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_inputs_no_fit(self):
+        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            }
+        ]
+        pipeline.blocks['another_primitive#1'].produce_args = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            },
+            {
+                'name': 'another_input',
+                'type': 'another_whatever'
+            }
+        ]
+
+        inputs = pipeline.get_inputs(fit=False)
+
+        expected = {
+            'input': {
+                'name': 'input',
+                'type': 'whatever',
+            },
+            'another_input': {
+                'name': 'another_input',
+                'type': 'another_whatever',
+            }
+        }
+        assert inputs == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_fit_args(self):
+        pipeline = MLPipeline(['a_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            }
+        ]
+
+        outputs = pipeline.get_fit_args()
+
+        expected = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            },
+            {
+                'name': 'fit_input',
+                'type': 'whatever',
+            }
+        ]
+        assert outputs == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_predict_args(self):
+        pipeline = MLPipeline(['a_primitive'])
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'output',
+                'type': 'another_whatever'
+            }
+        ]
+        outputs = pipeline.get_predict_args()
+
+        expected = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+        assert outputs == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_pending_all_primitives(self):
+        block_1 = get_mlblock_mock()
+        block_2 = get_mlblock_mock()
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+
+        self_ = MagicMock(autospec=MLPipeline)
+        self_.blocks = blocks
+        self_._last_fit_block = 'a.primitive.Name#2'
+
+        MLPipeline.fit(self_)
+
+        expected = [
+            call('a.primitive.Name#1'),
+            call('a.primitive.Name#2')
+        ]
+        assert self_._fit_block.call_args_list == expected
+
+        expected = [
+            call('a.primitive.Name#1'),
+        ]
+        assert self_._produce_block.call_args_list == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_pending_one_primitive(self):
+        block_1 = get_mlblock_mock()
+        block_2 = get_mlblock_mock()
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+
+        self_ = MagicMock(autospec=MLPipeline)
+        self_.blocks = blocks
+        self_._last_fit_block = 'a.primitive.Name#1'
+
+        MLPipeline.fit(self_)
+
+        expected = [
+            call('a.primitive.Name#1'),
+        ]
+        assert self_._fit_block.call_args_list == expected
+
+        assert not self_._produce_block.called
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_no_debug(self):
+        mlpipeline = MLPipeline(['a_primitive'])
+        mlpipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+
+        returned = mlpipeline.fit(debug=False)
+
+        assert returned is None
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_debug_bool(self):
+        mlpipeline = MLPipeline(['a_primitive'])
+        mlpipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+
+        expected_return = dict()
+        expected_return['debug'] = 'tmio'
+        expected_return['fit'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'input': {
+                    'whatever'
+                },
+                'memory': 0,
+            }
+        }
+
+        returned = mlpipeline.fit(debug=True)
+
+        assert isinstance(returned, dict)
+        assert set(returned.keys()) == set(expected_return.keys())  # fit / produce
+        assert set(returned['fit'].keys()) == set(expected_return['fit'].keys())  # block name
+
+        for block_name, dictionary in expected_return['fit'].items():
+            assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_debug_str(self):
+        mlpipeline = MLPipeline(['a_primitive'])
+        mlpipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+
+        expected_return = dict()
+        expected_return['debug'] = 'tm'
+        expected_return['fit'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'memory': 0,
+            }
+        }
+
+        returned = mlpipeline.fit(debug='tm')
+
+        assert isinstance(returned, dict)
+        assert set(returned.keys()) == set(expected_return.keys())  # fit / produce
+        assert set(returned['fit'].keys()) == set(expected_return['fit'].keys())  # block name
+
+        for block_name, dictionary in expected_return['fit'].items():
+            assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_produce_debug(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_primitive#1.a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+        mlpipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'a_name',
+                'type': 'a_type'
+            }
+        ]
+
+        expected_return = dict()
+        expected_return['debug'] = 'tmio'
+        expected_return['fit'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'input': {
+                    'whatever'
+                },
+                'memory': 0,
+            }
+        }
+        expected_return['produce'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'input': {
+                    'whatever'
+                },
+                'output': {
+                    'whatever'
+                },
+                'memory': 0,
+            }
+        }
+
+        returned, debug_returned = mlpipeline.fit(output_='default', debug=True)
+
+        assert len([returned]) == len(outputs['default'])
+        assert isinstance(debug_returned, dict)
+        assert set(debug_returned.keys()) == set(expected_return.keys())  # fit / produce
+        assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
+        assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())
+
+        for block_name, dictionary in expected_return['fit'].items():
+            assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+        for block_name, dictionary in expected_return['produce'].items():
+            assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_fit_produce_debug_str(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_primitive#1.a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+        mlpipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'fit_input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'a_name',
+                'type': 'a_type'
+            }
+        ]
+
+        expected_return = dict()
+        expected_return['debug'] = 'tm'
+        expected_return['fit'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'memory': 0,
+            }
+        }
+        expected_return['produce'] = {
+            'a_primitive#1': {
+                'time': 0,
+                'memory': 0,
+            }
+        }
+
+        returned, debug_returned = mlpipeline.fit(output_='default', debug='tm')
+
+        assert len([returned]) == len(outputs['default'])
+        assert isinstance(debug_returned, dict)
+        assert set(debug_returned.keys()) == set(expected_return.keys())  # fit / produce
+        assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
+        assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())
+
+        for block_name, dictionary in expected_return['fit'].items():
+            assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+        for block_name, dictionary in expected_return['produce'].items():
+            assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_predict_no_debug(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_primitive#1.a_variable',
+                    'type': 'a_type',
+                },
+                {
+                    'name': 'b_name',
+                    'variable': 'a_primitive#1.b_variable',
+                    'type': 'b_type',
+                },
+            ]
+        }
+        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+        mlpipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'a_name',
+                'type': 'a_type'
+            },
+            {
+                'name': 'b_name',
+                'type': 'b_type'
+            }
+        ]
+
+        returned = mlpipeline.predict(debug=False)
+        assert len(returned) == len(outputs['default'])
+        for returned_output, expected_output in zip(returned, outputs['default']):
+            assert returned_output == expected_output['variable']
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_predict_debug(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_primitive#1.a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+        mlpipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input',
+                'type': 'whatever'
+            }
+        ]
+
+        mlpipeline.blocks['a_primitive#1'].produce_output = [
+            {
+                'name': 'a_name',
+                'type': 'a_type'
+            }
+        ]
+
+        expected_return = dict()
+        expected_return = {
+            'a_primitive#1': {
+                'time': 0,
+                'input': {
+                    'whatever'
+                },
+                'output': {
+                    'whatever'
+                },
+                'memory': 0
+            }
+        }
+
+        returned, debug_returned = mlpipeline.predict(debug=True)
+        debug_returned = debug_returned['produce']
+
+        assert len([returned]) == len(outputs['default'])
+        assert isinstance(debug_returned, dict)
+        assert set(debug_returned.keys()) == set(expected_return.keys())
+
+        for block_name, dictionary in expected_return.items():
+            assert set(debug_returned[block_name].keys()) == set(dictionary.keys())
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_diagram_simple(self):
+        f = open('tests/data/diagrams/diagram_simple.txt', 'r')
+        expected = f.read()[:-1]
+        f.close()
+
+        output = [
+            {
+                'name': 'output_variable',
+                'type': 'another_whatever',
+                'variable': 'a_primitive#1.output_variable'
+            }
+        ]
+
+        pipeline = MLPipeline(['a_primitive'], outputs={'default': output})
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input_variable',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = output
+
+        assert str(pipeline.get_diagram()).strip() == expected.strip()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_diagram_fit(self):
+        f = open('tests/data/diagrams/diagram_fit.txt', 'r')
+        expected = f.read()[:-1]
+        f.close()
+
+        output = [
+            {
+                'name': 'output_variable',
+                'type': 'another_whatever',
+                'variable': 'a_primitive#1.output_variable'
+            }
+        ]
+
+        pipeline = MLPipeline(['a_primitive'], outputs={'default': output})
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input_variable',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].fit_args = [
+            {
+                'name': 'input_variable',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = output
+
+        assert str(pipeline.get_diagram()).strip() == expected.strip()
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+    def test_get_diagram_multiple_blocks(self):
+        f = open('tests/data/diagrams/diagram_multiple_blocks.txt', 'r')
+        expected = f.read()[:-1]
+        f.close()
+
+        first_output = [
+            {
+                'name': 'output_variable_a',
+                'type': 'another_whatever',
+                'variable': 'a_primitive#1.output_variable_a'
+            }
+        ]
+        second_output = [
+            {
+                'name': 'output_variable_b',
+                'type': 'another_whatever',
+                'variable': 'b_primitive#1.output_variable_b'
+            }
+        ]
+
+        pipeline = MLPipeline(['a_primitive', 'b_primitive'], outputs={'default': second_output})
+        pipeline.blocks['a_primitive#1'].produce_args = [
+            {
+                'name': 'input_variable',
+                'type': 'whatever'
+            }
+        ]
+        pipeline.blocks['a_primitive#1'].produce_output = first_output
+        pipeline.blocks['b_primitive#1'].produce_args = first_output
+        pipeline.blocks['b_primitive#1'].produce_output = second_output
+
+        assert str(pipeline.get_diagram()).strip() == expected.strip()
 
     def test_fit(self):
         pass
diff --git a/tests/test_primitives.py b/tests/test_primitives.py
deleted file mode 100644
index 65906406..00000000
--- a/tests/test_primitives.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import json
-import os
-import tempfile
-import uuid
-from unittest.mock import patch
-
-import pytest
-
-from mlblocks import primitives
-
-
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_add_primitives_path_do_nothing():
-    primitives.add_primitives_path('a')
-
-    assert primitives._PRIMITIVES_PATHS == ['a', 'b']
-
-
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_add_primitives_path_exception():
-    invalid_path = str(uuid.uuid4())
-
-    with pytest.raises(ValueError):
-        primitives.add_primitives_path(invalid_path)
-
-
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_add_primitives_path():
-    primitives.add_primitives_path('tests')
-
-    expected_path = os.path.abspath('tests')
-
-    assert primitives._PRIMITIVES_PATHS == [expected_path, 'a', 'b']
-
-
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_get_primitives_paths():
-    paths = primitives.get_primitives_paths()
-
-    assert paths == ['a', 'b']
-
-
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_load_primitive_value_error():
-    with pytest.raises(ValueError):
-        primitives.load_primitive('invalid.primitive')
-
-
-def test_load_primitive_success():
-    primitive = {
-        'name': 'temp.primitive',
-        'primitive': 'temp.primitive'
-    }
-
-    with tempfile.TemporaryDirectory() as tempdir:
-        primitives.add_primitives_path(tempdir)
-        primitive_path = os.path.join(tempdir, 'temp.primitive.json')
-        with open(primitive_path, 'w') as primitive_file:
-            json.dump(primitive, primitive_file, indent=4)
-
-        loaded = primitives.load_primitive('temp.primitive')
-
-        assert primitive == loaded
diff --git a/tox.ini b/tox.ini
index 76529366..cdaadc29 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,31 +1,27 @@
 [tox]
-envlist = py35, py36, lint, docs
-
+envlist = py3{6,7,8,9,10,11}, test-devel
 
 [travis]
 python =
-    3.6: py36, lint, docs
-    3.5: py35
-
+    3.13: py313
+    3.12: py312
+    3.11: py311
+    3.10: py310
+    3.9: py39
+    3.8: py38, test-devel
 
 [testenv]
 passenv = CI TRAVIS TRAVIS_*
-setenv =
-    PYTHONPATH = {toxinidir}
+allowlist_externals = rm
+skipsdist = false
+skip_install = false
 extras = test
 commands =
-    /usr/bin/env python -m pytest --cov=mlblocks
-
-
-[testenv:lint]
-skipsdist = true
-extras = dev
-commands =
-    /usr/bin/env make lint
-
+    /usr/bin/env make test
+    rm -r {envdir}
 
-[testenv:docs]
-skipsdist = true
+[testenv:test-devel]
 extras = dev
 commands =
-    /usr/bin/env make docs
+    /usr/bin/env make test-devel
+    rm -r {envdir}