From 396114b6140a12ead0d52993c8e42d61d2e4dd13 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 7 Jan 2019 20:13:41 +0100
Subject: [PATCH 001/160] Discover primitives using entry_points
---
mlblocks/primitives.py | 33 +++++++++++++++++++++++++--------
1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index 337116e7..8902b672 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -9,11 +9,15 @@
import json
import os
+import pkg_resources
import sys
+_PRIMITIVES_FOLDER_NAME = 'mlprimitives'
+_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives'
_PRIMITIVES_PATHS = [
- os.path.join(os.getcwd(), 'mlblocks_primitives'),
- os.path.join(sys.prefix, 'mlblocks_primitives'),
+ os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME),
+ os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME), # legacy
+ os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME), # legacy
]
@@ -45,7 +49,13 @@ def get_primitives_paths():
list:
The list of folders.
"""
- return _PRIMITIVES_PATHS
+
+ primitives_paths = list()
+ for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME):
+ path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name)
+ primitives_paths.append(path)
+
+ return _PRIMITIVES_PATHS + primitives_paths
def load_primitive(name):
@@ -69,10 +79,17 @@ def load_primitive(name):
found.
"""
- for base_path in _PRIMITIVES_PATHS:
- json_path = os.path.join(base_path, name + '.json')
- if os.path.isfile(json_path):
- with open(json_path, 'r') as json_file:
- return json.load(json_file)
+ for base_path in get_primitives_paths():
+ parts = name.split('.')
+ number_of_parts = len(parts)
+
+ for folder_parts in range(number_of_parts):
+ folder = os.path.join(base_path, *parts[:folder_parts])
+ filename = '.'.join(parts[folder_parts:]) + '.json'
+ json_path = os.path.join(folder, filename)
+
+ if os.path.isfile(json_path):
+ with open(json_path, 'r') as json_file:
+ return json.load(json_file)
raise ValueError("Unknown primitive: {}".format(name))
From da04277d5268b194bd33707735a14b79d1cf1239 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 8 Jan 2019 14:36:27 +0100
Subject: [PATCH 002/160] Fix import order and add tests
---
mlblocks/primitives.py | 6 ++++--
tests/test_primitives.py | 30 +++++++++++++++++++++++++++++-
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index 8902b672..d4825bf6 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -9,9 +9,10 @@
import json
import os
-import pkg_resources
import sys
+import pkg_resources
+
_PRIMITIVES_FOLDER_NAME = 'mlprimitives'
_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives'
_PRIMITIVES_PATHS = [
@@ -52,7 +53,8 @@ def get_primitives_paths():
primitives_paths = list()
for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME):
- path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name)
+ module_path = os.path.join(*entry_point.module_name.split('.'))
+ path = pkg_resources.resource_filename(entry_point.name, module_path)
primitives_paths.append(path)
return _PRIMITIVES_PATHS + primitives_paths
diff --git a/tests/test_primitives.py b/tests/test_primitives.py
index 65906406..990c4da5 100644
--- a/tests/test_primitives.py
+++ b/tests/test_primitives.py
@@ -7,6 +7,7 @@
from unittest.mock import patch
import pytest
+from pkg_resources import EntryPoint
from mlblocks import primitives
@@ -36,12 +37,39 @@ def test_add_primitives_path():
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_get_primitives_paths():
+@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name')
+def test_get_primitives_paths_no_entry_points():
paths = primitives.get_primitives_paths()
assert paths == ['a', 'b']
+@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+def test_get_primitives_paths_entry_points(iep_mock):
+ # setup
+ iep_mock.return_value = [
+ EntryPoint('mlblocks', 'primitives.jsons')
+ ]
+
+ # run
+ paths = primitives.get_primitives_paths()
+
+ # assert
+ expected = [
+ 'a',
+ 'b',
+ os.path.join(
+ os.path.dirname(primitives.__file__),
+ 'primitives',
+ 'jsons'
+ )
+ ]
+ assert paths == expected
+
+ iep_mock.assert_called_once_with('mlprimitives')
+
+
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
def test_load_primitive_value_error():
with pytest.raises(ValueError):
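With this fix, an entry point published under the ``mlprimitives`` group is interpreted as ``<package> = <dotted.path.inside.the.package>`` and resolved to a folder through ``pkg_resources.resource_filename``. A hypothetical registration mirroring the values used in the new test::

    entry_points = {
        'mlprimitives': [
            # resolves to the primitives/jsons folder
            # inside the installed mlblocks package
            'mlblocks = primitives.jsons'
        ]
    }

The next patch replaces this scheme with an explicit ``jsons_path`` entry point that is loaded directly.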
From f551d339217554472fb5cecc162d5ab31f0d10d6 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 8 Jan 2019 19:02:43 +0100
Subject: [PATCH 003/160] Change slightly the way the entry points are used and
add docs
---
.gitignore | 1 +
docs/advanced_usage/adding_primitives.rst | 34 ++++++++-
docs/api/mlblocks.primitives.rst | 5 ++
docs/index.rst | 1 +
docs/pipeline.json | 91 -----------------------
mlblocks/primitives.py | 33 +++++---
tests/__init__.py | 0
tests/test_primitives.py | 30 +++++---
8 files changed, 83 insertions(+), 112 deletions(-)
create mode 100644 docs/api/mlblocks.primitives.rst
delete mode 100644 docs/pipeline.json
create mode 100644 tests/__init__.py
diff --git a/.gitignore b/.gitignore
index cbc1f8c1..011ff452 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,6 +64,7 @@ instance/
# Sphinx documentation
docs/_build/
+docs/pipeline.json
# PyBuilder
target/
diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst
index fc2e81b9..e3d4b964 100644
--- a/docs/advanced_usage/adding_primitives.rst
+++ b/docs/advanced_usage/adding_primitives.rst
@@ -29,7 +29,7 @@ by writing the corresponding `JSON annotation
.. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives
-.. note:: If you integrate new primitives for MLBlocks, please consider contributing them to the
+.. note:: If you create new primitives for MLBlocks, please consider contributing them to the
**MLPrimitives** project!
The first thing to do when adding a new primitive is making sure that it complies with the
@@ -58,8 +58,8 @@ place known to **MLBlocks**.
**MLBlocks** looks for primitives in the following folders, in this order:
1. Any folder specified by the user, starting by the latest one.
-2. A folder named `mlblocks_primitives` in the current working directory.
-3. A folder named `mlblocks_primitives` in the `system prefix`_.
+2. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory.
+3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_.
.. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix
@@ -80,3 +80,31 @@ However, sometimes you will want to add a custom directory.
This can be easily done by using the `mlblocks.add_primitives_path`_ method.
.. _mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path
+
+Developing a Primitives Library
+-------------------------------
+
+Another option to add multiple primitives at once is creating a primitives library, such as
+`MLPrimitives`_.
+
+In order to make **MLBlocks** able to find the primitives defined in such a library,
+all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
+following specification:
+
+1. It has to be published under the name ``mlprimitives``.
+2. It has to be named exactly ``jsons_path``.
+3. It has to point at a variable that contains the path to the JSONs folder.
+
+An example of such an entry point would be::
+
+ entry_points = {
+ 'mlprimitives': [
+ 'jsons_path=some_module:SOME_VARIABLE'
+ ]
+ }
+
+where the module `some_module` contains a variable such as::
+
+ SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+.. _Entry Point: https://packaging.python.org/specifications/entry-points/
diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst
new file mode 100644
index 00000000..d625c774
--- /dev/null
+++ b/docs/api/mlblocks.primitives.rst
@@ -0,0 +1,5 @@
+mlblocks.primitives
+===================
+
+.. automodule:: mlblocks.primitives
+ :members:
diff --git a/docs/index.rst b/docs/index.rst
index 28a3f0bb..2bb4c5a9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -74,6 +74,7 @@ integrate with deep learning libraries.
api/mlblocks
api/mlblocks.datasets
+ api/mlblocks.primitives
.. toctree::
:caption: Resources
diff --git a/docs/pipeline.json b/docs/pipeline.json
deleted file mode 100644
index c09d763c..00000000
--- a/docs/pipeline.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
- "primitives": [
- "sklearn.preprocessing.StandardScaler",
- "sklearn.ensemble.RandomForestClassifier"
- ],
- "init_params": {
- "sklearn.preprocessing.StandardScaler": {
- "with_mean": false
- },
- "sklearn.ensemble.RandomForestClassifier": {
- "n_estimators": 100
- }
- },
- "input_names": {},
- "output_names": {},
- "hyperparameters": {
- "sklearn.preprocessing.StandardScaler#1": {
- "with_mean": false,
- "with_std": true
- },
- "sklearn.ensemble.RandomForestClassifier#1": {
- "n_jobs": -1,
- "n_estimators": 100,
- "criterion": "entropy",
- "max_features": null,
- "max_depth": 10,
- "min_samples_split": 0.1,
- "min_samples_leaf": 0.1,
- "class_weight": null
- }
- },
- "tunable_hyperparameters": {
- "sklearn.preprocessing.StandardScaler#1": {
- "with_std": {
- "type": "bool",
- "default": true
- }
- },
- "sklearn.ensemble.RandomForestClassifier#1": {
- "criterion": {
- "type": "str",
- "default": "entropy",
- "values": [
- "entropy",
- "gini"
- ]
- },
- "max_features": {
- "type": "str",
- "default": null,
- "range": [
- null,
- "auto",
- "log2"
- ]
- },
- "max_depth": {
- "type": "int",
- "default": 10,
- "range": [
- 1,
- 30
- ]
- },
- "min_samples_split": {
- "type": "float",
- "default": 0.1,
- "range": [
- 0.0001,
- 0.5
- ]
- },
- "min_samples_leaf": {
- "type": "float",
- "default": 0.1,
- "range": [
- 0.0001,
- 0.5
- ]
- },
- "class_weight": {
- "type": "str",
- "default": null,
- "range": [
- null,
- "balanced"
- ]
- }
- }
- }
-}
\ No newline at end of file
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index d4825bf6..8aaaa60f 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -13,12 +13,11 @@
import pkg_resources
-_PRIMITIVES_FOLDER_NAME = 'mlprimitives'
-_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives'
_PRIMITIVES_PATHS = [
- os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME),
- os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME), # legacy
- os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME), # legacy
+ os.path.join(os.getcwd(), 'mlprimitives'),
+ os.path.join(sys.prefix, 'mlprimitives'),
+ os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy
+ os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy
]
@@ -46,16 +45,32 @@ def add_primitives_path(path):
def get_primitives_paths():
"""Get the list of folders where the primitives will be looked for.
+ This list will include the value of any `entry_point` named `jsons_path` published under
+ the name `mlprimitives`.
+
+ An example of such an entry point would be::
+
+ entry_points = {
+ 'mlprimitives': [
+ 'jsons_path=some_module:SOME_VARIABLE'
+ ]
+ }
+
+ where the module `some_module` contains a variable such as::
+
+ SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
Returns:
list:
The list of folders.
"""
primitives_paths = list()
- for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME):
- module_path = os.path.join(*entry_point.module_name.split('.'))
- path = pkg_resources.resource_filename(entry_point.name, module_path)
- primitives_paths.append(path)
+ entry_points = pkg_resources.iter_entry_points('mlprimitives')
+ for entry_point in entry_points:
+ if entry_point.name == 'jsons_path':
+ path = entry_point.load()
+ primitives_paths.append(path)
return _PRIMITIVES_PATHS + primitives_paths
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_primitives.py b/tests/test_primitives.py
index 990c4da5..1afd17b6 100644
--- a/tests/test_primitives.py
+++ b/tests/test_primitives.py
@@ -7,10 +7,12 @@
from unittest.mock import patch
import pytest
-from pkg_resources import EntryPoint
+from pkg_resources import Distribution, EntryPoint
from mlblocks import primitives
+FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
def test_add_primitives_path_do_nothing():
@@ -37,19 +39,33 @@ def test_add_primitives_path():
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name')
-def test_get_primitives_paths_no_entry_points():
+@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+def test_get_primitives_paths_no_entry_points(iep_mock):
+ # setup
+ iep_mock.return_value = []
+
+ # run
paths = primitives.get_primitives_paths()
+ # assert
assert paths == ['a', 'b']
+ iep_mock.assert_called_once_with('mlprimitives')
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
def test_get_primitives_paths_entry_points(iep_mock):
# setup
+ something_else_ep = EntryPoint('something_else', 'mlblocks.__version__')
+ jsons_path_ep = EntryPoint(
+ 'jsons_path',
+ 'tests.test_primitives',
+ attrs=['FAKE_MLPRIMITIVES_PATH'],
+ dist=Distribution()
+ )
iep_mock.return_value = [
- EntryPoint('mlblocks', 'primitives.jsons')
+ something_else_ep,
+ jsons_path_ep
]
# run
@@ -59,11 +75,7 @@ def test_get_primitives_paths_entry_points(iep_mock):
expected = [
'a',
'b',
- os.path.join(
- os.path.dirname(primitives.__file__),
- 'primitives',
- 'jsons'
- )
+ 'this/is/a/fake'
]
assert paths == expected
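Putting the documented convention together, a minimal ``setup.py`` for a hypothetical primitives library could look like this; every name except the ``mlprimitives`` group and the ``jsons_path`` entry point name is an assumption::

    # setup.py of a hypothetical primitives library
    from setuptools import find_packages, setup

    setup(
        name='my-primitives-lib',
        version='0.1.0',
        packages=find_packages(),
        # ship the JSON annotations inside the package
        package_data={'my_primitives_lib': ['jsons/*.json']},
        entry_points={
            'mlprimitives': [
                # MLBlocks loads this variable to locate the JSONs folder
                'jsons_path=my_primitives_lib:MLBLOCKS_JSONS_PATH'
            ]
        },
    )

where ``my_primitives_lib/__init__.py`` would define::

    import os

    MLBLOCKS_JSONS_PATH = os.path.join(os.path.dirname(__file__), 'jsons')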
From 74da0e2249cb30100229c64cd0a83a4543daf12e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 9 Jan 2019 16:06:47 +0100
Subject: [PATCH 004/160] Add logging statements
---
mlblocks/__init__.py | 6 +++---
mlblocks/datasets.py | 9 +++++++++
mlblocks/mlblock.py | 6 +++++-
mlblocks/mlpipeline.py | 38 ++++++++++++++++++++++++--------------
mlblocks/primitives.py | 5 +++++
5 files changed, 46 insertions(+), 18 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index cfc0ef6a..43079986 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -10,9 +10,9 @@
* Documentation: https://HDI-Project.github.io/MLBlocks
"""
-from mlblocks.mlblock import MLBlock # noqa
-from mlblocks.mlpipeline import MLPipeline # noqa
-from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive # noqa
+from mlblocks.mlblock import MLBlock
+from mlblocks.mlpipeline import MLPipeline
+from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive
__author__ = 'MIT Data To AI Lab'
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py
index fba968e8..b5ed6b46 100644
--- a/mlblocks/datasets.py
+++ b/mlblocks/datasets.py
@@ -40,6 +40,7 @@
"""
import io
+import logging
import os
import tarfile
import urllib
@@ -52,6 +53,8 @@
from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+LOGGER = logging.getLogger(__name__)
+
INPUT_SHAPE = [224, 224, 3]
DATA_PATH = os.path.join(
@@ -183,9 +186,12 @@ def get_splits(self, n_splits=1):
def _download(dataset_name, dataset_path):
url = DATA_URL.format(dataset_name)
+
+ LOGGER.debug('Downloading dataset %s from %s', dataset_name, url)
response = urllib.request.urlopen(url)
bytes_io = io.BytesIO(response.read())
+ LOGGER.debug('Extracting dataset into %s', DATA_PATH)
with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf:
tf.extractall(DATA_PATH)
@@ -202,6 +208,7 @@ def _load(dataset_name):
def _load_images(image_dir, filenames):
+ LOGGER.debug('Loading %s images from %s', len(filenames), image_dir)
images = []
for filename in filenames:
filename = os.path.join(image_dir, filename)
@@ -217,6 +224,8 @@ def _load_images(image_dir, filenames):
def _load_csv(dataset_path, name, set_index=False):
csv_path = os.path.join(dataset_path, name + '.csv')
+
+ LOGGER.debug('Loading csv %s', csv_path)
df = pd.read_csv(csv_path)
if set_index:
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 9b6ec0d0..04a4bf55 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -3,9 +3,12 @@
"""Package where the MLBlock class is defined."""
import importlib
+import logging
from mlblocks.primitives import load_primitive
+LOGGER = logging.getLogger(__name__)
+
def import_object(object_name):
"""Import an object from its Fully Qualified Name."""
@@ -83,7 +86,7 @@ def _extract_params(self, kwargs, hyperparameters):
value = param['default']
else:
- raise TypeError("Required argument '{}' not found".format(name))
+ raise TypeError("{} required argument '{}' not found".format(self.name, name))
init_params[name] = value
@@ -193,6 +196,7 @@ def set_hyperparameters(self, hyperparameters):
self._hyperparameters.update(hyperparameters)
if self._class:
+ LOGGER.debug('Creating a new primitive instance for %s', self.name)
self.instance = self.primitive(**self._hyperparameters)
def fit(self, **kwargs):
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 4bad5d1f..058737ee 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -215,19 +215,25 @@ def fit(self, X=None, y=None, **kwargs):
last_block_name = list(self.blocks.keys())[-1]
for block_name, block in self.blocks.items():
- fit_args = self._get_block_args(block_name, block.fit_args, context)
-
LOGGER.debug("Fitting block %s", block_name)
- block.fit(**fit_args)
+ try:
+ fit_args = self._get_block_args(block_name, block.fit_args, context)
+ block.fit(**fit_args)
+ except Exception:
+ LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
+ raise
if block_name != last_block_name:
- produce_args = self._get_block_args(block_name, block.produce_args, context)
-
LOGGER.debug("Producing block %s", block_name)
- outputs = block.produce(**produce_args)
+ try:
+ produce_args = self._get_block_args(block_name, block.produce_args, context)
+ outputs = block.produce(**produce_args)
- output_dict = self._get_outputs(block_name, outputs, block.produce_output)
- context.update(output_dict)
+ output_dict = self._get_outputs(block_name, outputs, block.produce_output)
+ context.update(output_dict)
+ except Exception:
+ LOGGER.exception("Exception caught producing MLBlock %s", block_name)
+ raise
def predict(self, X=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
@@ -252,14 +258,18 @@ def predict(self, X=None, **kwargs):
last_block_name = list(self.blocks.keys())[-1]
for block_name, block in self.blocks.items():
- produce_args = self._get_block_args(block_name, block.produce_args, context)
-
LOGGER.debug("Producing block %s", block_name)
- outputs = block.produce(**produce_args)
+ try:
+ produce_args = self._get_block_args(block_name, block.produce_args, context)
+ outputs = block.produce(**produce_args)
- if block_name != last_block_name:
- output_dict = self._get_outputs(block_name, outputs, block.produce_output)
- context.update(output_dict)
+ if block_name != last_block_name:
+ output_dict = self._get_outputs(block_name, outputs, block.produce_output)
+ context.update(output_dict)
+
+ except Exception:
+ LOGGER.exception("Exception caught producing MLBlock %s", block_name)
+ raise
return outputs
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index 337116e7..c6e50790 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -8,9 +8,12 @@
"""
import json
+import logging
import os
import sys
+LOGGER = logging.getLogger(__name__)
+
_PRIMITIVES_PATHS = [
os.path.join(os.getcwd(), 'mlblocks_primitives'),
os.path.join(sys.prefix, 'mlblocks_primitives'),
@@ -35,6 +38,7 @@ def add_primitives_path(path):
if not os.path.isdir(path):
raise ValueError('Invalid path: {}'.format(path))
+ LOGGER.debug('Adding new primitives path %s', path)
_PRIMITIVES_PATHS.insert(0, os.path.abspath(path))
@@ -73,6 +77,7 @@ def load_primitive(name):
json_path = os.path.join(base_path, name + '.json')
if os.path.isfile(json_path):
with open(json_path, 'r') as json_file:
+ LOGGER.debug('Loading primitive %s from %s', name, json_path)
return json.load(json_file)
raise ValueError("Unknown primitive: {}".format(name))
From a2cf239fd22d5d8c0e50eabef11d65f5f2c65bbd Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 9 Jan 2019 18:04:33 +0100
Subject: [PATCH 005/160] Filter conditionals from tunable hyperparameters
---
mlblocks/mlblock.py | 35 +++++--
tests/test_mlblock.py | 219 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 247 insertions(+), 7 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 04a4bf55..618ebc75 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -110,6 +110,33 @@ def _extract_params(self, kwargs, hyperparameters):
return init_params, fit_params, produce_params
+ @staticmethod
+ def _filter_conditional(conditional, init_params):
+ condition = conditional['condition']
+ if condition not in init_params:
+ return conditional
+
+ condition_value = init_params[condition]
+ values = conditional['values']
+ conditioned = values.get(condition_value) or values.get('*')
+ if conditioned:
+ return conditioned
+
+ @classmethod
+ def _get_tunable(cls, hyperparameters, init_params):
+ tunable = dict()
+ for name, param in hyperparameters.get('tunable', dict()).items():
+ if name not in init_params:
+ if param['type'] == 'conditional':
+ param = cls._filter_conditional(param, init_params)
+ if param is not None:
+ tunable[name] = param
+
+ else:
+ tunable[name] = param
+
+ return tunable
+
def __init__(self, name, **kwargs):
self.name = name
@@ -136,13 +163,7 @@ def __init__(self, name, **kwargs):
self._fit_params = fit_params
self._produce_params = produce_params
- tunable = hyperparameters.get('tunable', dict())
- self._tunable = {
- name: param
- for name, param in tunable.items()
- if name not in init_params
- # TODO: filter conditionals
- }
+ self._tunable = self._get_tunable(hyperparameters, init_params)
default = {
name: param['default']
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index abc235b0..970df5ed 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -23,6 +23,225 @@ class TestMLBlock(TestCase):
def test__extract_params(self):
pass
+ def test__get_tunable_no_conditionals(self):
+ """If there are no conditionals, tunables are returned unmodified."""
+
+ # setup
+ init_params = {
+ 'an_init_param': 'a_value'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ }
+ }
+ assert tunable == expected
+
+ def test__get_tunable_no_condition(self):
+ """If there is a conditiona but no condition, conditional is returned unmodified."""
+
+ # setup
+ init_params = {
+ 'an_init_param': 'a_value'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': 1,
+ 'values': {
+ 1: {
+ 'type': 'int',
+ 'default': 0
+ },
+ '*': {
+ 'type': 'str',
+ 'default': 'whatever'
+ }
+ }
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': 1,
+ 'values': {
+ 1: {
+ 'type': 'int',
+ 'default': 0
+ },
+ '*': {
+ 'type': 'str',
+ 'default': 'whatever'
+ }
+ }
+ }
+ }
+ assert tunable == expected
+
+ def test__get_tunable_condition_match(self):
+ """If there is a conditional and it matches, only that part is returned."""
+
+ # setup
+ init_params = {
+ 'a_condition': 'match'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': 1,
+ 'values': {
+ 'match': {
+ 'type': 'int',
+ 'default': 0
+ },
+ '*': {
+ 'type': 'str',
+ 'default': 'whatever'
+ }
+ }
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'int',
+ 'default': 0
+ }
+ }
+ assert tunable == expected
+
+ def test__get_tunable_condition_wildcard_match(self):
+ """If there is a conditional and it matches the wildcard, only that part is returned."""
+
+ # setup
+ init_params = {
+ 'a_condition': 'no_match'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': 1,
+ 'values': {
+ 'match': {
+ 'type': 'int',
+ 'default': 0
+ },
+ '*': {
+ 'type': 'str',
+ 'default': 'whatever'
+ }
+ }
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'str',
+ 'default': 'whatever'
+ }
+ }
+ assert tunable == expected
+
+ def test__get_tunable_condition_no_match(self):
+ """If there is a conditional without match or wildcard, it is not returned."""
+
+ # setup
+ init_params = {
+ 'a_condition': 'no_match'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': 1,
+ 'values': {
+ 'match': {
+ 'type': 'int',
+ 'default': 0
+ }
+ }
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1
+ }
+ }
+ assert tunable == expected
+
@patch('mlblocks.mlblock.MLBlock.set_hyperparameters')
@patch('mlblocks.mlblock.import_object')
@patch('mlblocks.mlblock.load_primitive')
From 31b36d4779e8faeb38449025aec30b0b90c51378 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 17:59:03 +0100
Subject: [PATCH 006/160] Changed slightly the behavior of the conditional
hyperparameters. Also include docs
---
docs/advanced_usage/hyperparameters.rst | 19 ++-
mlblocks/mlblock.py | 8 +-
tests/test_mlblock.py | 192 +++++++++++++++++-------
3 files changed, 156 insertions(+), 63 deletions(-)
diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst
index bc31d4fd..71686ac5 100644
--- a/docs/advanced_usage/hyperparameters.rst
+++ b/docs/advanced_usage/hyperparameters.rst
@@ -165,6 +165,19 @@ Conditional Hyperparameters
In some other cases, the values that a hyperparameter can take depend on the value of another
one.
+For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending
+on the kernel used some other hyperparameters may be or not be used, or they might be able
+to take only some specific values.
+
+In this case, the ``type`` of the hyperparameter whose values depend on the other is specified
+as ``conditional``.
+Alongside this type, two additional entries are required:
+
+* an entry called ``condition``, which specifies the name of the other hyperparameter, the value
+ of which is evaluated to decide which values this hyperparameter can take.
+* an additional subdictionary called ``values``, which relates the possible values that the
+ `condition` hyperparameter can have with the full specifications of the type and values that
+ this hyperparameter can take in each case.
Suppose, for example, that the primitive explained in the previous point does not expect
the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter,
@@ -190,7 +203,7 @@ In this case, the hyperparameters would be annotated like this::
}
"max_features_aggregation": {
"type": "conditional",
- "condition": "mas_features",
+ "condition": "max_features",
"default": null,
"values": {
"auto": {
@@ -202,6 +215,10 @@ In this case, the hyperparameters would be annotated like this::
}
}
+.. note:: Just like a regular hyperparameter, if there is no match the default entry is used.
+ In this example, the ``null`` value indicates that the hyperparameter needs to be
+ disabled if there is no match, but we could instead provide a full specification
+ of type, range and default value as a nested dictionary to be used as the default.
.. _JSON Annotations: primitives.html#json-annotations
.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 618ebc75..a5cdb6a4 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -113,14 +113,14 @@ def _extract_params(self, kwargs, hyperparameters):
@staticmethod
def _filter_conditional(conditional, init_params):
condition = conditional['condition']
+ default = conditional.get('default')
+
if condition not in init_params:
- return conditional
+ return default
condition_value = init_params[condition]
values = conditional['values']
- conditioned = values.get(condition_value) or values.get('*')
- if conditioned:
- return conditioned
+ return values.get(condition_value, default)
@classmethod
def _get_tunable(cls, hyperparameters, init_params):
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 970df5ed..5273d40c 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -34,7 +34,8 @@ def test__get_tunable_no_conditionals(self):
'tunable': {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
}
}
}
@@ -46,13 +47,14 @@ def test__get_tunable_no_conditionals(self):
expected = {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
}
}
assert tunable == expected
def test__get_tunable_no_condition(self):
- """If there is a conditiona but no condition, conditional is returned unmodified."""
+ """If there is a conditional but no condition, the default is used."""
# setup
init_params = {
@@ -62,20 +64,27 @@ def test__get_tunable_no_condition(self):
'tunable': {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
'type': 'conditional',
'condition': 'a_condition',
- 'default': 1,
+ 'default': {
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
+ },
'values': {
- 1: {
- 'type': 'int',
- 'default': 0
- },
- '*': {
+ 'not_a_match': {
'type': 'str',
- 'default': 'whatever'
+ 'default': 'a',
+ 'values': ['a', 'b']
+ },
+ 'neither_a_match': {
+ 'type': 'int',
+ 'default': 0,
+ 'range': [1, 10]
}
}
}
@@ -89,22 +98,13 @@ def test__get_tunable_no_condition(self):
expected = {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
- 'type': 'conditional',
- 'condition': 'a_condition',
- 'default': 1,
- 'values': {
- 1: {
- 'type': 'int',
- 'default': 0
- },
- '*': {
- 'type': 'str',
- 'default': 'whatever'
- }
- }
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
}
}
assert tunable == expected
@@ -114,26 +114,33 @@ def test__get_tunable_condition_match(self):
# setup
init_params = {
- 'a_condition': 'match'
+ 'a_condition': 'a_match'
}
hyperparameters = {
'tunable': {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
'type': 'conditional',
'condition': 'a_condition',
- 'default': 1,
+ 'default': {
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
+ },
'values': {
- 'match': {
- 'type': 'int',
- 'default': 0
- },
- '*': {
+ 'not_a_match': {
'type': 'str',
- 'default': 'whatever'
+ 'default': 'a',
+ 'values': ['a', 'b']
+ },
+ 'a_match': {
+ 'type': 'int',
+ 'default': 0,
+ 'range': [1, 10]
}
}
}
@@ -147,40 +154,49 @@ def test__get_tunable_condition_match(self):
expected = {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
'type': 'int',
- 'default': 0
+ 'default': 0,
+ 'range': [1, 10]
}
}
assert tunable == expected
- def test__get_tunable_condition_wildcard_match(self):
- """If there is a conditional and it matches the wildcard, only that part is returned."""
+ def test__get_tunable_condition_no_match(self):
+ """If there is a conditional and it does not match, the default is used."""
# setup
init_params = {
- 'a_condition': 'no_match'
+ 'a_condition': 'not_a_match'
}
hyperparameters = {
'tunable': {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
'type': 'conditional',
'condition': 'a_condition',
- 'default': 1,
+ 'default': {
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
+ },
'values': {
- 'match': {
- 'type': 'int',
- 'default': 0
- },
- '*': {
+ 'also_not_a_match': {
'type': 'str',
- 'default': 'whatever'
+ 'default': 'a',
+ 'values': ['a', 'b']
+ },
+ 'neither_a_match': {
+ 'type': 'int',
+ 'default': 0,
+ 'range': [1, 10]
}
}
}
@@ -194,36 +210,45 @@ def test__get_tunable_condition_wildcard_match(self):
expected = {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
- 'type': 'str',
- 'default': 'whatever'
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
}
}
assert tunable == expected
- def test__get_tunable_condition_no_match(self):
- """If there is a conditional without match or wildcard, it is not returned."""
+ def test__get_tunable_condition_default_null(self):
+ """If there is no match and default is null (None), this param is not included."""
# setup
init_params = {
- 'a_condition': 'no_match'
+ 'a_condition': 'not_a_match'
}
hyperparameters = {
'tunable': {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
},
'this_is_conditional': {
'type': 'conditional',
'condition': 'a_condition',
- 'default': 1,
+ 'default': None,
'values': {
- 'match': {
+ 'also_not_a_match': {
+ 'type': 'str',
+ 'default': 'a',
+ 'values': ['a', 'b']
+ },
+ 'neither_a_match': {
'type': 'int',
- 'default': 0
+ 'default': 0,
+ 'range': [1, 10]
}
}
}
@@ -237,7 +262,58 @@ def test__get_tunable_condition_no_match(self):
expected = {
'this_is_not_conditional': {
'type': 'int',
- 'default': 1
+ 'default': 1,
+ 'range': [1, 10]
+ }
+ }
+ assert tunable == expected
+
+ def test__get_tunable_condition_match_null(self):
+ """If there is a match and it is null (None), this param is not included.
+
+ This holds even if the default is not null.
+ """
+
+ # setup
+ init_params = {
+ 'a_condition': 'a_match'
+ }
+ hyperparameters = {
+ 'tunable': {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1,
+ 'range': [1, 10]
+ },
+ 'this_is_conditional': {
+ 'type': 'conditional',
+ 'condition': 'a_condition',
+ 'default': {
+ 'type': 'float',
+ 'default': 0.1,
+ 'values': [0, 1]
+ },
+ 'values': {
+ 'not_a_match': {
+ 'type': 'str',
+ 'default': 'a',
+ 'values': ['a', 'b']
+ },
+ 'a_match': None
+ }
+ }
+ }
+ }
+
+ # run
+ tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+ # assert
+ expected = {
+ 'this_is_not_conditional': {
+ 'type': 'int',
+ 'default': 1,
+ 'range': [1, 10]
}
}
assert tunable == expected
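After this change the resolution rules are: if the ``condition`` hyperparameter is not fixed in ``init_params``, or its value has no entry in ``values``, the ``default`` specification is used, and a ``null`` specification drops the hyperparameter entirely. A minimal sketch of the behavior, using an illustrative kernel-style annotation::

    from mlblocks.mlblock import MLBlock

    hyperparameters = {
        'tunable': {
            'degree': {
                'type': 'conditional',
                'condition': 'kernel',
                'default': None,    # drop unless the condition matches
                'values': {
                    'poly': {'type': 'int', 'default': 3, 'range': [2, 5]}
                }
            }
        }
    }

    # kernel fixed to 'poly': 'degree' becomes a plain int hyperparameter
    MLBlock._get_tunable(hyperparameters, {'kernel': 'poly'})
    # {'degree': {'type': 'int', 'default': 3, 'range': [2, 5]}}

    # any other kernel: the default is null, so 'degree' is dropped
    MLBlock._get_tunable(hyperparameters, {'kernel': 'rbf'})
    # {}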
From 0ede2707da8e4d2866416cac28e2b03c69a68e47 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:26:21 +0100
Subject: [PATCH 007/160] Release notes for v0.3.0
---
HISTORY.md | 7 +++++++
mlblocks/__init__.py | 2 +-
setup.cfg | 7 +++----
setup.py | 2 +-
4 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/HISTORY.md b/HISTORY.md
index d08624dc..a312c9cb 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,13 @@
Changelog
=========
+0.3.0 - New Primitives Discovery
+--------------------------------
+
+* New primitives discovery system based on `entry_points`.
+* Conditional Hyperparameters filtering in MLBlock initialization.
+* Improved logging and exception reporting.
+
0.2.4 - New Datasets and Unit Tests
-----------------------------------
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 43079986..3a9e6bcb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.2.5-dev'
+__version__ = '0.3.0-dev'
__all__ = [
'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index fbc69b07..a9255027 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
[bumpversion]
-current_version = 0.2.5-dev
+current_version = 0.3.0-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = release
-values =
+values =
dev
release
@@ -45,4 +45,3 @@ collect_ignore = ['setup.py']
[tool:pylint]
good-names = X,y
-
diff --git a/setup.py b/setup.py
index 9d4b4cfc..5c21f44b 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.2.5-dev',
+ version='0.3.0-dev',
zip_safe=False,
)
From bb0bb0d44bcc44e1517825409e1d092670ddde27 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:33:45 +0100
Subject: [PATCH 008/160] =?UTF-8?q?Bump=20version:=200.3.0-dev=20=E2=86=92?=
=?UTF-8?q?=200.3.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 7 ++++---
setup.py | 2 +-
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3a9e6bcb..93bd80bb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.0-dev'
+__version__ = '0.3.0'
__all__ = [
'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9255027..3026d2ba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
[bumpversion]
-current_version = 0.3.0-dev
+current_version = 0.3.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = release
-values =
+values =
dev
release
@@ -45,3 +45,4 @@ collect_ignore = ['setup.py']
[tool:pylint]
good-names = X,y
+
diff --git a/setup.py b/setup.py
index 5c21f44b..a59a74f0 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.0-dev',
+ version='0.3.0',
zip_safe=False,
)
From e1ca77bce3c4537c0800a4c1395e1b6bbde5465d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:34:07 +0100
Subject: [PATCH 009/160] =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?=
=?UTF-8?q?.3.1-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 93bd80bb..cf326495 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.0'
+__version__ = '0.3.1-dev'
__all__ = [
'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 3026d2ba..e976dec7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.0
+current_version = 0.3.1-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index a59a74f0..a8ac84d7 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.0',
+ version='0.3.1-dev',
zip_safe=False,
)
From d3cbee730139b2d0117a1de1474a581844505196 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 19 Apr 2019 13:38:02 +0200
Subject: [PATCH 010/160] Initial implementation to work with intermediate
outputs
---
mlblocks/mlpipeline.py | 82 ++++++++++++++++++++++++++++++++++++------
1 file changed, 71 insertions(+), 11 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 058737ee..d5928b69 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -166,7 +166,7 @@ def _get_block_args(self, block_name, block_args, context):
return kwargs
- def _get_outputs(self, block_name, outputs, block_outputs):
+ def _extract_outputs(self, block_name, outputs, block_outputs):
# TODO: type validation and/or transformation should be done here
if not isinstance(outputs, tuple):
@@ -188,7 +188,40 @@ def _get_outputs(self, block_name, outputs, block_outputs):
return output_dict
- def fit(self, X=None, y=None, **kwargs):
+ def _get_block_name(self, index):
+ return list(self.blocks.keys())[index]
+
+ def _get_output_spec(self, output):
+ if output is None:
+ return None, None
+
+ if isinstance(output, int):
+ output = self._get_block_name(output)
+
+ if output in self.blocks:
+ return output, None
+
+ if '.' in output:
+ output_block, output_variable = output.rsplit('.', 1)
+ if output_block not in self.blocks:
+ raise ValueError('Unknown block name: {}'.format(output_block))
+
+ return output_block, output_variable
+
+ last_block_name = self._get_block_name(-1)
+ return last_block_name, output
+
+ def _get_output(self, output_variable, context):
+ if output_variable:
+ if output_variable not in context:
+ raise ValueError('Output variable {} not found in context'
+ .format(output_variable))
+
+ return context[output_variable]
+ else:
+ return context
+
+ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
"""Fit the blocks of this pipeline.
Sequentially call the `fit` and the `produce` methods of each block,
@@ -213,8 +246,19 @@ def fit(self, X=None, y=None, **kwargs):
}
context.update(kwargs)
- last_block_name = list(self.blocks.keys())[-1]
+ output_block, output_variable = self._get_output_spec(output)
+ last_block_name = self._get_block_name(-1)
+
+ if isinstance(skip_to, int):
+ skip_to = self._get_block_name(skip_to)
+
for block_name, block in self.blocks.items():
+ if block_name == skip_to:
+ skip_to = False
+ elif skip_to:
+ LOGGER.debug("Skipping block %s fit", block_name)
+ continue
+
LOGGER.debug("Fitting block %s", block_name)
try:
fit_args = self._get_block_args(block_name, block.fit_args, context)
@@ -223,19 +267,22 @@ def fit(self, X=None, y=None, **kwargs):
LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
raise
- if block_name != last_block_name:
+ if (block_name != last_block_name) or (block_name == output_block):
LOGGER.debug("Producing block %s", block_name)
try:
produce_args = self._get_block_args(block_name, block.produce_args, context)
outputs = block.produce(**produce_args)
- output_dict = self._get_outputs(block_name, outputs, block.produce_output)
+ output_dict = self._extract_outputs(block_name, outputs, block.produce_output)
context.update(output_dict)
except Exception:
LOGGER.exception("Exception caught producing MLBlock %s", block_name)
raise
- def predict(self, X=None, **kwargs):
+ if block_name == output_block:
+ return self._get_output(output_variable, context)
+
+ def predict(self, X=None, output='y', skip_to=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the `produce` method of each block, capturing the
@@ -256,22 +303,35 @@ def predict(self, X=None, **kwargs):
}
context.update(kwargs)
- last_block_name = list(self.blocks.keys())[-1]
+ output_block, output_variable = self._get_output_spec(output)
+
+ if isinstance(skip_to, int):
+ skip_to = self._get_block_name(skip_to)
+
for block_name, block in self.blocks.items():
+ if block_name == skip_to:
+ skip_to = False
+ elif skip_to:
+ LOGGER.debug("Skipping block %s produce", block_name)
+ continue
+
LOGGER.debug("Producing block %s", block_name)
try:
produce_args = self._get_block_args(block_name, block.produce_args, context)
outputs = block.produce(**produce_args)
+ output_dict = self._extract_outputs(block_name, outputs, block.produce_output)
+ context.update(output_dict)
- if block_name != last_block_name:
- output_dict = self._get_outputs(block_name, outputs, block.produce_output)
- context.update(output_dict)
+ if block_name == output_block:
+ return self._get_output(output_variable, context)
except Exception:
LOGGER.exception("Exception caught producing MLBlock %s", block_name)
raise
- return outputs
+ if skip_to:
+ # We skipped all the blocks up to the end
+ raise ValueError('Unknown block name: {}'.format(skip_to))
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
From 59fae909d44afb78005425c6c4a24de567391eb5 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:48:38 +0200
Subject: [PATCH 011/160] Update contributing guide to match the current
release workflow
---
CONTRIBUTING.rst | 34 ++++++++++++++++++----------------
1 file changed, 18 insertions(+), 16 deletions(-)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 2db74080..4fce53bf 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -172,24 +172,26 @@ The process of releasing a new version involves several steps combining both ``g
1. Merge what is in ``master`` branch into ``stable`` branch.
2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
-3. Create a new TAG pointing at the correspoding commit in ``stable`` branch.
+3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
4. Merge the new commit from ``stable`` into ``master``.
-5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next
- development interation.
+5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
+ to open the next development iteration.
-**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled
-**Unreleased** with the list of changes that will be included in the new version, and that
-these changes are committed and available in ``master`` branch.
-Normally this is just a list of the Pull Requests that have been merged since the latest version.
+.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
+ entry that explains the changes that will be included in the new version.
+ Normally this is just a list of the Pull Requests that have been merged to master
+ since the last release.
-Once this is done, just run the following commands::
+Once this is done, run one of the following commands:
+
+1. If you are releasing a patch version::
- git checkout stable
- git merge --no-ff master # This creates a merge commit
- bumpversion release # This creates a new commit and a TAG
- git push --tags origin stable
make release
- git checkout master
- git merge stable
- bumpversion --no-tag patch
- git push
+
+2. If you are releasing a minor version::
+
+ make release-minor
+
+3. If you are releasing a major version::
+
+ make release-major
From e768037076387fcb9a33e494c9c89421f0c657a8 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:49:47 +0200
Subject: [PATCH 012/160] Update docs config
---
Makefile | 1 -
docs/changelog.rst | 2 +-
docs/conf.py | 20 +++++++-------------
setup.py | 1 -
4 files changed, 8 insertions(+), 16 deletions(-)
diff --git a/Makefile b/Makefile
index dc62e90d..c2d2aaa4 100644
--- a/Makefile
+++ b/Makefile
@@ -122,7 +122,6 @@ coverage: ## check code coverage quickly with the default Python
.PHONY: docs
docs: clean-docs ## generate Sphinx HTML documentation, including API docs
$(MAKE) -C docs html
- touch docs/_build/html/.nojekyll
.PHONY: view-docs
view-docs: docs ## view docs in browser
diff --git a/docs/changelog.rst b/docs/changelog.rst
index fcd2eb2d..d26e5be8 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1 +1 @@
-.. include:: ../HISTORY.md
+.. mdinclude:: ../HISTORY.md
diff --git a/docs/conf.py b/docs/conf.py
index 8659996f..9b4595ec 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,18 +18,9 @@
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
-import os
-import sys
-
import sphinx_rtd_theme # For read the docs theme
-from recommonmark.parser import CommonMarkParser
-# from recommonmark.transform import AutoStructify
-
-# sys.path.insert(0, os.path.abspath('..'))
import mlblocks
-#
-# mlblocks.add_primitives_path('../mlblocks_primitives')
# -- General configuration ---------------------------------------------
@@ -40,8 +31,11 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
- 'sphinx.ext.napoleon',
+ 'm2r',
+ 'sphinx.ext.autodoc',
'sphinx.ext.githubpages',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.napoleon',
'sphinx.ext.graphviz',
'IPython.sphinxext.ipython_console_highlighting',
'IPython.sphinxext.ipython_directive',
@@ -56,9 +50,9 @@
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst', '.md', '.ipynb']
-source_parsers = {
- '.md': CommonMarkParser,
-}
+# source_parsers = {
+# '.md': CommonMarkParser,
+# }
# The master toctree document.
master_doc = 'index'
diff --git a/setup.py b/setup.py
index a8ac84d7..f6991ab1 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
'graphviz==0.9',
'ipython==6.5.0',
'matplotlib==2.2.3',
- 'recommonmark>=0.4.0',
# style check
'flake8>=3.5.0',
From 080580d45c9b47680fbc31d30aee4e8478292711 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:08 +0200
Subject: [PATCH 013/160] Remove spaces
---
setup.cfg | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/setup.cfg b/setup.cfg
index e976dec7..62ced521 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,13 +3,13 @@ current_version = 0.3.1-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = release
-values =
+values =
dev
release
@@ -45,4 +45,3 @@ collect_ignore = ['setup.py']
[tool:pylint]
good-names = X,y
-
From e25fa6d3ac3af2f20b205ed73d91d28124bc8c16 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:32 +0200
Subject: [PATCH 014/160] Add docstrings
---
mlblocks/mlpipeline.py | 127 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 113 insertions(+), 14 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d5928b69..abbac922 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -69,6 +69,7 @@ class MLPipeline():
"""
def _get_tunable_hyperparameters(self):
+ """Get the tunable hyperperparameters from all the blocks in this pipeline."""
tunable = {}
for block_name, block in self.blocks.items():
tunable[block_name] = block.get_tunable_hyperparameters()
@@ -140,6 +141,24 @@ def set_hyperparameters(self, hyperparameters):
self.blocks[block_name].set_hyperparameters(block_hyperparams)
def _get_block_args(self, block_name, block_args, context):
+ """Get the arguments expected by the block method from the context.
+
+ The arguments will be taken from the context using both the method
+ arguments specification and the `input_names` given when the pipeline
+ was created.
+
+ Args:
+ block_name (str): Name of this block. Used to find the corresponding
+ input_names.
+ block_args (list): list of method argument specifications from the
+ primitive.
+ context (dict): current context dictionary.
+
+ Returns:
+ dict:
+ A dictionary containing the argument names and values to pass
+ to the method.
+ """
# TODO: type validation and/or transformation should be done here
input_names = self.input_names.get(block_name, dict())
@@ -167,6 +186,7 @@ def _get_block_args(self, block_name, block_args, context):
return kwargs
def _extract_outputs(self, block_name, outputs, block_outputs):
+ """Extract the outputs of the method as a dict to be set into the context."""
# TODO: type validation and/or transformation should be done here
if not isinstance(outputs, tuple):
@@ -189,9 +209,36 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
return output_dict
def _get_block_name(self, index):
+ """Get the name of the block in the `index` position."""
return list(self.blocks.keys())[index]
def _get_output_spec(self, output):
+ """Parsre the output specification and get a block name and a variable name.
+
+ The output specification can be of two types: int and str.
+
+ If it is an integer, it is interpreted as a block index, and the variable name
+ is considered to be ``None``, which means that the whole context will be returned.
+
+ If it is a string, it is interpreted as the block name, and it has to match a block
+ name exactly, including its hash and counter number ``#n``. Optionally, a variable
+ name can be passed at the end using a ``'.'`` as a separator.
+ In this case, the format of the string is `{block_name}.{variable_name}`. Note
+ that the block name can also contain dots, so only the rightmost dot will be
+ considered, and only if the complete string does not exactly match a block name.
+
+ Args:
+ output (str or int): Output specification as either a string or an integer.
+
+ Returns:
+ tuple:
+ The output is a tuple containing:
+ * block_name (str): name of the block from which the output will be
+ returned, including its counter number.
+ * variable_name (str): Name of the variable to extract from the context.
+ It can be ``None``, which means that the whole context is to be
+ returned.
+ """
if output is None:
return None, None
@@ -212,6 +259,10 @@ def _get_output_spec(self, output):
return last_block_name, output
def _get_output(self, output_variable, context):
+ """Get the specified output variable from the context.
+
+ If the variable name is ``None``, return the entire context.
+ """
if output_variable:
if output_variable not in context:
raise ValueError('Output variable {} not found in context'
@@ -221,7 +272,7 @@ def _get_output(self, output_variable, context):
else:
return context
- def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
+ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
"""Fit the blocks of this pipeline.
Sequentially call the `fit` and the `produce` methods of each block,
@@ -237,8 +288,32 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
X: Fit Data, which the pipeline will learn from.
y: Fit Data labels, which the pipeline will use to learn how to
behave.
+ output (str or int): Output specification, which can be a string or an integer.
+ If an integer is given, it is interpreted as the block number, and the whole
+ context after running the specified block will be returned.
+ If a string is given, it is expected to be the name of one block, including
+ its counter number at the end. Optionally, a variable name can be included
+ at the end after the counter number using a ``'.'`` as a separator between the
+ block name and the variable name. If the variable name is given, this will be
+ extracted from the context and returned. Otherwise, the whole context will
+ be returned.
+ start_on (str or int): Block index or block name to start processing from. The
+ value can either be an integer, which will be interpreted as a block index,
+ or the name of a block, including the counter number at the end.
+ If given, the execution of the pipeline will start on the specified block,
+ and all the blocks before that one will be skipped.
**kwargs: Any additional keyword arguments will be directly added
to the context dictionary and available for the blocks.
+
+ Returns:
+ None or dict or object:
+ * If no output is specified, nothing will be returned.
+ * If an output block has been specified without an output variable, the
+ context dictionary will be returned after the produce method of that block
+ has been called.
+ * If both an output block and an output variable have been specified,
+ the value of that variable from the context will extracted and returned
+ after the produce method of that block has been called.
"""
context = {
'X': X,
@@ -249,13 +324,13 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
output_block, output_variable = self._get_output_spec(output)
last_block_name = self._get_block_name(-1)
- if isinstance(skip_to, int):
- skip_to = self._get_block_name(skip_to)
+ if isinstance(start_on, int):
+ start_on = self._get_block_name(start_on)
for block_name, block in self.blocks.items():
- if block_name == skip_to:
- skip_to = False
- elif skip_to:
+ if block_name == start_on:
+ start_on = False
+ elif start_on:
LOGGER.debug("Skipping block %s fit", block_name)
continue
@@ -282,7 +357,7 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
if block_name == output_block:
return self._get_output(output_variable, context)
- def predict(self, X=None, output='y', skip_to=None, **kwargs):
+ def predict(self, X=None, output='y', start_on=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the `produce` method of each block, capturing the
@@ -295,8 +370,32 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs):
Args:
X: Data which the pipeline will use to make predictions.
+ output (str or int): Output specification, which can be a string or an integer.
+ If an integer is given, it is interpreted as the block number, and the whole
+ context after running the specified block will be returned.
+ If a string is given, it is expected to be the name of one block, including
+ its counter number at the end. Optionally, a variable name can be included
+ at the end after the counter number using a ``'.'`` as a separator between the
+ block name and the variable name. If the variable name is given, this will be
+ extracted from the context and returned. Otherwise, the whole context will
+ be returned.
+ start_on (str or int): Block index or block name to start processing from. The
+ value can either be an integer, which will be interpreted as a block index,
+ or the name of a block, including the counter number at the end.
+ If given, the execution of the pipeline will start on the specified block,
+ and all the blocks before that one will be skipped.
**kwargs: Any additional keyword arguments will be directly added
to the context dictionary and available for the blocks.
+
+ Returns:
+ None or dict or object:
+ * If no output is specified, the output of the last block will be returned.
+ * If an output block has been specified without an output variable, the
+ context dictionary will be returned after the produce method of that block
+ has been called.
+ * If both an output block and an output variable have been specified,
+ the value of that variable from the context will be extracted and returned
+ after the produce method of that block has been called.
"""
context = {
'X': X
@@ -305,13 +404,13 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs):
output_block, output_variable = self._get_output_spec(output)
- if isinstance(skip_to, int):
- skip_to = self._get_block_name(skip_to)
+ if isinstance(start_on, int):
+ start_on = self._get_block_name(start_on)
for block_name, block in self.blocks.items():
- if block_name == skip_to:
- skip_to = False
- elif skip_to:
+ if block_name == start_on:
+ start_on = False
+ elif start_on:
LOGGER.debug("Skipping block %s produce", block_name)
continue
@@ -329,9 +428,9 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs):
LOGGER.exception("Exception caught producing MLBlock %s", block_name)
raise
- if skip_to:
+ if start_on:
# We skipped all the blocks up to the end
- raise ValueError('Unknown block name: {}'.format(skip_to))
+ raise ValueError('Unknown block name: {}'.format(start_on))
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
From 5e9be7aa7188d38ca6eafb684c24171b9e61f322 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:51:09 +0200
Subject: [PATCH 015/160] Update primitive names to match the latest versions
of MLPrimitives
---
docs/getting_started/quickstart.rst | 2 +-
docs/pipeline_examples/graph.rst | 4 ++--
docs/pipeline_examples/text.rst | 22 +++++++++++-----------
3 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index 2e00ece6..2115fcef 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -24,7 +24,7 @@ them to the `MLPipeline class`_:
from mlblocks import MLPipeline
primitives = [
- 'mlprimitives.feature_extraction.StringVectorizer',
+ 'mlprimitives.custom.feature_extraction.StringVectorizer',
'sklearn.ensemble.RandomForestClassifier',
]
pipeline = MLPipeline(primitives)
diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst
index 5503e739..54ef85a1 100644
--- a/docs/pipeline_examples/graph.rst
+++ b/docs/pipeline_examples/graph.rst
@@ -39,7 +39,7 @@ additional information not found inside `X`.
primitives = [
'networkx.link_prediction_feature_extraction',
- 'mlprimitives.feature_extraction.CategoricalEncoder',
+ 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
'sklearn.preprocessing.StandardScaler',
'xgboost.XGBClassifier'
]
@@ -69,6 +69,6 @@ additional information not found inside `X`.
.. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html
-.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json
+.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json
.. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
.. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst
index df8a9d5a..03472ea3 100644
--- a/docs/pipeline_examples/text.rst
+++ b/docs/pipeline_examples/text.rst
@@ -40,31 +40,31 @@ for later ones.
# set up the pipeline
primitives = [
- "mlprimitives.counters.UniqueCounter",
- "mlprimitives.text.TextCleaner",
- "mlprimitives.counters.VocabularyCounter",
+ "mlprimitives.custom.counters.UniqueCounter",
+ "mlprimitives.custom.text.TextCleaner",
+ "mlprimitives.custom.counters.VocabularyCounter",
"keras.preprocessing.text.Tokenizer",
"keras.preprocessing.sequence.pad_sequences",
"keras.Sequential.LSTMTextClassifier"
]
input_names = {
- "mlprimitives.counters.UniqueCounter#1": {
+ "mlprimitives.custom.counters.UniqueCounter#1": {
"X": "y"
}
}
output_names = {
- "mlprimitives.counters.UniqueCounter#1": {
+ "mlprimitives.custom.counters.UniqueCounter#1": {
"counts": "classes"
},
- "mlprimitives.counters.VocabularyCounter#1": {
+ "mlprimitives.custom.counters.VocabularyCounter#1": {
"counts": "vocabulary_size"
}
}
init_params = {
- "mlprimitives.counters.VocabularyCounter#1": {
+ "mlprimitives.custom.counters.VocabularyCounter#1": {
"add": 1
},
- "mlprimitives.text.TextCleaner#1": {
+ "mlprimitives.custom.text.TextCleaner#1": {
"language": "en"
},
"keras.preprocessing.sequence.pad_sequences#1": {
@@ -116,12 +116,12 @@ to encode all the string features, and go directly into the
nltk.download('stopwords')
primitives = [
- 'mlprimitives.text.TextCleaner',
- 'mlprimitives.feature_extraction.StringVectorizer',
+ 'mlprimitives.custom.text.TextCleaner',
+ 'mlprimitives.custom.feature_extraction.StringVectorizer',
'sklearn.ensemble.RandomForestClassifier',
]
init_params = {
- 'mlprimitives.text.TextCleaner': {
+ 'mlprimitives.custom.text.TextCleaner': {
'column': 'text',
'language': 'nl'
},
From 9f0ae6a3fa000896d8f530b72f6da46d23c31e4b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 7 May 2019 17:12:33 +0200
Subject: [PATCH 016/160] Add random state to datasets get_splits
---
mlblocks/datasets.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py
index b5ed6b46..fb32df9c 100644
--- a/mlblocks/datasets.py
+++ b/mlblocks/datasets.py
@@ -141,7 +141,7 @@ def _get_split(data, index):
else:
return data[index]
- def get_splits(self, n_splits=1):
+ def get_splits(self, n_splits=1, random_state=0):
"""Return splits of this dataset ready for Cross Validation.
If n_splits is 1, a tuple containing the X for train and test
@@ -166,12 +166,13 @@ def get_splits(self, n_splits=1):
self.data,
self.target,
shuffle=self._shuffle,
- stratify=stratify
+ stratify=stratify,
+ random_state=random_state
)
else:
cv_class = StratifiedKFold if self._stratify else KFold
- cv = cv_class(n_splits=n_splits, shuffle=self._shuffle)
+ cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state)
splits = list()
for train, test in cv.split(self.data, self.target):
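With `random_state` exposed, the generated splits become reproducible across runs.
A quick sketch, assuming one of the demo datasets bundled in `mlblocks.datasets`:

    from mlblocks.datasets import load_iris

    dataset = load_iris()

    # The same seed yields the same train/test partition every time
    X_train, X_test, y_train, y_test = dataset.get_splits(1, random_state=42)

    # And the same folds when cross validating
    splits = dataset.get_splits(n_splits=5, random_state=42)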
From 5aea64755b7b7f9b4e68f6faa9a0912c1a55033a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 7 May 2019 17:12:58 +0200
Subject: [PATCH 017/160] Rename output and start arguments
---
mlblocks/mlpipeline.py | 40 ++++++++++++++++++++--------------------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index abbac922..91e44341 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -272,7 +272,7 @@ def _get_output(self, output_variable, context):
else:
return context
- def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
+ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
"""Fit the blocks of this pipeline.
Sequentially call the `fit` and the `produce` methods of each block,
@@ -288,7 +288,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
X: Fit Data, which the pipeline will learn from.
y: Fit Data labels, which the pipeline will use to learn how to
behave.
- output (str or int): Output specification, which can be a string or an integer.
+ output_ (str or int): Output specification, which can be a string or an integer.
If an integer is given, it is interpreted as the block number, and the whole
context after running the specified block will be returned.
If a string is given, it is expected to be the name of one block, including
@@ -297,7 +297,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
block name and the variable name. If the variable name is given, this will be
extracted from the context and returned. Otherwise, the whole context will
be returned.
- start_on (str or int): Block index or block name to start processing from. The
+ start_ (str or int): Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
@@ -321,16 +321,16 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
}
context.update(kwargs)
- output_block, output_variable = self._get_output_spec(output)
+ output_block, output_variable = self._get_output_spec(output_)
last_block_name = self._get_block_name(-1)
- if isinstance(start_on, int):
- start_on = self._get_block_name(start_on)
+ if isinstance(start_, int):
+ start_ = self._get_block_name(start_)
for block_name, block in self.blocks.items():
- if block_name == start_on:
- start_on = False
- elif start_on:
+ if block_name == start_:
+ start_ = False
+ elif start_:
LOGGER.debug("Skipping block %s fit", block_name)
continue
@@ -357,7 +357,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
if block_name == output_block:
return self._get_output(output_variable, context)
- def predict(self, X=None, output='y', start_on=None, **kwargs):
+ def predict(self, X=None, output_='y', start_=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the `produce` method of each block, capturing the
@@ -370,7 +370,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs):
Args:
X: Data which the pipeline will use to make predictions.
- output (str or int): Output specification, which can be a string or an integer.
+ output_ (str or int): Output specification, which can be a string or an integer.
If an integer is given, it is interpreted as the block number, and the whole
context after running the specified block will be returned.
If a string is given, it is expected to be the name of one block, including
@@ -379,7 +379,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs):
block name and the variable name. If the variable name is given, this will be
extracted from the context and returned. Otherwise, the whole context will
be returned.
- start_on (str or int): Block index or block name to start processing from. The
+ start_ (str or int): Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
@@ -402,15 +402,15 @@ def predict(self, X=None, output='y', start_on=None, **kwargs):
}
context.update(kwargs)
- output_block, output_variable = self._get_output_spec(output)
+ output_block, output_variable = self._get_output_spec(output_)
- if isinstance(start_on, int):
- start_on = self._get_block_name(start_on)
+ if isinstance(start_, int):
+ start_ = self._get_block_name(start_)
for block_name, block in self.blocks.items():
- if block_name == start_on:
- start_on = False
- elif start_on:
+ if block_name == start_:
+ start_ = False
+ elif start_:
LOGGER.debug("Skipping block %s produce", block_name)
continue
@@ -428,9 +428,9 @@ def predict(self, X=None, output='y', start_on=None, **kwargs):
LOGGER.exception("Exception caught producing MLBlock %s", block_name)
raise
- if start_on:
+ if start_:
# We skipped all the blocks up to the end
- raise ValueError('Unknown block name: {}'.format(start_on))
+ raise ValueError('Unknown block name: {}'.format(start_))
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
From 4607b3898aa9767774f872b936f2311492179746 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 7 May 2019 17:13:12 +0200
Subject: [PATCH 018/160] Add unit tests for partial outputs feature
---
tests/features/test_partial_outputs.py | 133 +++++++++++++++++++++++++
1 file changed, 133 insertions(+)
create mode 100644 tests/features/test_partial_outputs.py
diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py
new file mode 100644
index 00000000..ce28d457
--- /dev/null
+++ b/tests/features/test_partial_outputs.py
@@ -0,0 +1,133 @@
+from unittest import TestCase
+from unittest.mock import Mock
+
+import numpy as np
+
+from mlblocks.datasets import load_iris
+from mlblocks.mlpipeline import MLPipeline
+
+
+def almost_equal(obj1, obj2):
+ if isinstance(obj1, dict):
+ if not isinstance(obj2, dict):
+ raise AssertionError("{} is not a {}".format(type(obj2), dict))
+
+ for key, value in obj1.items():
+ if key not in obj2:
+ raise AssertionError("{} not in {}".format(key, obj2))
+ almost_equal(value, obj2[key])
+
+ else:
+ np.testing.assert_almost_equal(obj1, obj2)
+
+
+class TestPartialOutputs(TestCase):
+ def setUp(self):
+ dataset = load_iris()
+
+ self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1)
+
+ def test_fit_output(self):
+
+ # Setup variables
+ primitives = [
+ 'sklearn.preprocessing.StandardScaler',
+ 'sklearn.linear_model.LogisticRegression'
+ ]
+ pipeline = MLPipeline(primitives)
+
+ int_block = 0
+ invalid_int = 10
+ str_block = 'sklearn.preprocessing.StandardScaler#1'
+ invalid_block = 'InvalidBlockName'
+ str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y'
+ invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'
+
+ # Run
+ int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block)
+ str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block)
+ str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5],
+ output_=str_block_variable)
+ no_output = pipeline.fit(self.X_train, self.y_train)
+
+ # Assert successful calls
+ X = np.array([
+ [0.71269665, -1.45152899, 0.55344946, 0.31740553],
+ [0.26726124, 1.23648766, -1.1557327, -1.0932857],
+ [-1.95991577, 0.967686, -1.1557327, -1.0932857],
+ [0.71269665, -0.645124, 0.39067021, 0.31740553],
+ [0.26726124, -0.10752067, 1.36734573, 1.55176035]
+ ])
+ y = np.array([1, 0, 0, 1, 2])
+ context = {
+ 'X': X,
+ 'y': y
+ }
+ almost_equal(context, int_out)
+ almost_equal(context, str_out)
+
+ almost_equal(y, str_out_variable)
+
+ assert no_output is None
+
+ # Run asserting exceptions
+ with self.assertRaises(IndexError):
+ pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int)
+
+ with self.assertRaises(ValueError):
+ pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block)
+
+ with self.assertRaises(ValueError):
+ pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable)
+
+ def test_fit_start(self):
+ # Setup variables
+ primitives = [
+ 'sklearn.preprocessing.StandardScaler',
+ 'sklearn.linear_model.LogisticRegression'
+ ]
+ pipeline = MLPipeline(primitives)
+
+ # Mock the first block
+ block_mock = Mock()
+ pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock
+
+ # Run the pipeline skipping the first block
+ context = {
+ 'X': self.X_train,
+ 'y': self.y_train
+ }
+ int_start = 1
+ str_start = 'sklearn.linear_model.LogisticRegression#1'
+
+ pipeline.fit(start_=int_start, **context)
+ pipeline.fit(start_=str_start, **context)
+
+ # Assert that mock has not been called
+ block_mock.fit.assert_not_called()
+
+ def test_predict_start(self):
+ # Setup variables
+ primitives = [
+ 'sklearn.preprocessing.StandardScaler',
+ 'sklearn.linear_model.LogisticRegression'
+ ]
+ pipeline = MLPipeline(primitives)
+ pipeline.fit(self.X_train, self.y_train)
+
+ # Mock the first block
+ block_mock = Mock()
+ pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock
+
+ # Run the pipeline skipping the first block
+ context = {
+ 'X': self.X_train,
+ }
+ int_start = 1
+ str_start = 'sklearn.linear_model.LogisticRegression#1'
+
+ pipeline.predict(start_=int_start, **context)
+ pipeline.predict(start_=str_start, **context)
+
+ # Assert that mock has not been called
+ block_mock.predict.assert_not_called()
From 980794b67165e286d49cb81cf742ea44fd760365 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 9 May 2019 15:14:23 +0200
Subject: [PATCH 019/160] Improve docstrings and add toc in autogenerated API
reference
---
Makefile | 5 +
docs/conf.py | 9 +-
mlblocks/datasets.py | 12 +-
mlblocks/mlblock.py | 79 +++++++------
mlblocks/mlpipeline.py | 256 +++++++++++++++++++++++++++--------------
mlblocks/primitives.py | 3 +-
setup.cfg | 6 +
setup.py | 4 +
8 files changed, 234 insertions(+), 140 deletions(-)
diff --git a/Makefile b/Makefile
index c2d2aaa4..6266033f 100644
--- a/Makefile
+++ b/Makefile
@@ -98,6 +98,11 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort
autopep8 --in-place --recursive --aggressive tests
isort --apply --atomic --recursive tests
+.PHONY: lint-docs
+lint-docs: ## check docs formatting with doc8 and pydocstyle
+ doc8 mlblocks/
+ pydocstyle mlblocks/
+
# TEST TARGETS
diff --git a/docs/conf.py b/docs/conf.py
index 9b4595ec..95653914 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -39,8 +39,13 @@
'sphinx.ext.graphviz',
'IPython.sphinxext.ipython_console_highlighting',
'IPython.sphinxext.ipython_directive',
+ 'autodocsumm',
]
+autodoc_default_options = {
+ 'autosummary': True,
+}
+
ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"]
# Add any paths that contain templates here, relative to this directory.
@@ -50,10 +55,6 @@
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst', '.md', '.ipynb']
-# source_parsers = {
-# '.md': CommonMarkParser,
-# }
-
# The master toctree document.
master_doc = 'index'
diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py
index fb32df9c..0c69afda 100644
--- a/mlblocks/datasets.py
+++ b/mlblocks/datasets.py
@@ -100,6 +100,7 @@ class Dataset():
**kwargs: Any additional keyword argument passed on initialization will be made
available as instance attributes.
"""
+
def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
self.name = description.splitlines()[0]
@@ -115,10 +116,10 @@ def __init__(self, description, data, target, score, shuffle=True, stratify=Fals
self.__dict__.update(kwargs)
def score(self, *args, **kwargs):
- """Scoring function for this dataset.
+ r"""Scoring function for this dataset.
Args:
- \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be
+ \*args, \*\*kwargs: Any given arguments and keyword arguments will be
directly passed to the given scoring function.
Returns:
@@ -315,7 +316,6 @@ def load_dic28():
There exist 52,652 words (vertices in a network) having 2 up to 8 characters
in the dictionary. The obtained network has 89038 edges.
"""
-
dataset_path = _load('dic28')
X = _load_csv(dataset_path, 'data')
@@ -344,7 +344,6 @@ def load_nomination():
Data consists of one graph whose nodes contain two attributes, attr1 and attr2.
Associated with each node is a label that has to be learned and predicted.
"""
-
dataset_path = _load('nomination')
X = _load_csv(dataset_path, 'data')
@@ -363,7 +362,6 @@ def load_amazon():
co-purchased with product j, the graph contains an undirected edge from i to j.
Each product category provided by Amazon defines each ground-truth community.
"""
-
dataset_path = _load('amazon')
X = _load_csv(dataset_path, 'data')
@@ -383,7 +381,6 @@ def load_jester():
source: "University of California Berkeley, CA"
sourceURI: "http://eigentaste.berkeley.edu/dataset/"
"""
-
dataset_path = _load('jester')
X = _load_csv(dataset_path, 'data')
@@ -393,7 +390,7 @@ def load_jester():
def load_wikiqa():
- """A Challenge Dataset for Open-Domain Question Answering.
+ """Challenge Dataset for Open-Domain Question Answering.
WikiQA dataset is a publicly available set of question and sentence (QS) pairs,
collected and annotated for research on open-domain question answering.
@@ -401,7 +398,6 @@ def load_wikiqa():
source: "Microsoft"
sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#"
""" # noqa
-
dataset_path = _load('wikiqa')
data = _load_csv(dataset_path, 'data', set_index=True)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index a5cdb6a4..c3878e68 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -25,32 +25,34 @@ class MLBlock():
as wrapping them and providing a common interface to run them.
Attributes:
- name (str): Name given to this MLBlock.
- primitive (object): the actual function or instance which this MLBlock
- wraps.
- fit_args (dict): specification of the arguments expected by the `fit`
- method.
- fit_method (str): name of the primitive method to call on `fit`.
- `None` if the primitive is a function.
- produce_args (dict): specification of the arguments expected by the
- `predict` method.
- produce_output (dict): specification of the outputs of the `produce`
- method.
- produce_method (str): name of the primitive method to call on
- `produce`. `None` if the primitive is a function.
+ name (str):
+ Name given to this MLBlock.
+ primitive (object):
+ the actual function or instance which this MLBlock wraps.
+ fit_args (dict):
+ specification of the arguments expected by the `fit` method.
+ fit_method (str):
+ name of the primitive method to call on `fit`. `None` if the primitive is a function.
+ produce_args (dict):
+ specification of the arguments expected by the `predict` method.
+ produce_output (dict):
+ specification of the outputs of the `produce` method.
+ produce_method (str):
+ name of the primitive method to call on `produce`. `None` if the primitive is a
+ function.
Args:
- name (str): Name given to this MLBlock.
- **kwargs: Any additional arguments that will be used as
- hyperparameters or passed to the `fit` or `produce`
- methods.
+ name (str):
+ Name given to this MLBlock.
+ **kwargs:
+ Any additional arguments that will be used as hyperparameters or passed to the
+ `fit` or `produce` methods.
Raises:
- TypeError: A `TypeError` is raised if a required argument is not
- found within the `kwargs` or if an unexpected
- argument has been given.
- """
- # pylint: disable=too-many-instance-attributes
+ TypeError:
+ A `TypeError` is raised if a required argument is not found within the `kwargs`
+ or if an unexpected argument has been given.
+ """ # pylint: disable=too-many-instance-attributes
def _extract_params(self, kwargs, hyperparameters):
"""Extract init, fit and produce params from kwargs.
@@ -63,16 +65,16 @@ def _extract_params(self, kwargs, hyperparameters):
have been given and that nothing unexpected exists in the input.
Args:
- kwargs (dict): dict containing the Keyword arguments that have
- been passed to the `__init__` method upon
- initialization.
- hyperparameters (dict): hyperparameters dictionary, as found in
- the JSON annotation.
+ kwargs (dict):
+ dict containing the Keyword arguments that have been passed to the `__init__`
+ method upon initialization.
+ hyperparameters (dict):
+ hyperparameters dictionary, as found in the JSON annotation.
Raises:
- TypeError: A `TypeError` is raised if a required argument is not
- found in the `kwargs` dict, or if an unexpected
- argument has been given.
+ TypeError:
+ A `TypeError` is raised if a required argument is not found in the `kwargs` dict,
+ or if an unexpected argument has been given.
"""
init_params = dict()
fit_params = dict()
@@ -138,7 +140,6 @@ def _get_tunable(cls, hyperparameters, init_params):
return tunable
def __init__(self, name, **kwargs):
-
self.name = name
metadata = load_primitive(name)
@@ -174,6 +175,7 @@ def __init__(self, name, **kwargs):
self.set_hyperparameters(default)
def __str__(self):
+ """Return a string that represents this block."""
return 'MLBlock - {}'.format(self.name)
def get_tunable_hyperparameters(self):
@@ -210,9 +212,9 @@ def set_hyperparameters(self, hyperparameters):
If necessary, a new instance of the primitive is created.
Args:
- hyperparameters (dict): Dictionary containing as keys the name
- of the hyperparameters and as values
- the values to be used.
+ hyperparameters (dict):
+ Dictionary containing as keys the name of the hyperparameters and as
+ values the values to be used.
"""
self._hyperparameters.update(hyperparameters)
@@ -233,12 +235,13 @@ def fit(self, **kwargs):
the primitive is a simple function, this will be a noop.
Args:
- **kwargs: Any given keyword argument will be directly passed
- to the primitive fit method.
+ **kwargs:
+ Any given keyword argument will be directly passed to the primitive fit method.
Raises:
- TypeError: A `TypeError` might be raised if any argument not
- expected by the primitive fit method is given.
+ TypeError:
+ A `TypeError` might be raised if any argument not expected by the primitive fit
+ method is given.
"""
if self.fit_method is not None:
fit_args = self._fit_params.copy()
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 91e44341..eddb442e 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -34,38 +34,35 @@ class MLPipeline():
results, which will be returned as the prediction of the pipeline.
Attributes:
- primitives (list): List of the names of the primitives that compose
- this pipeline.
- blocks (list): OrderedDict of the block names and the corresponding
- MLBlock instances.
- init_params (dict): init_params dictionary, as given when the instance
- was created.
- input_names (dict): input_names dictionary, as given when the instance
- was created.
- output_names (dict): output_names dictionary, as given when the instance
- was created.
+ primitives (list):
+ List of the names of the primitives that compose this pipeline.
+ blocks (list):
+ OrderedDict of the block names and the corresponding MLBlock instances.
+ init_params (dict):
+ init_params dictionary, as given when the instance was created.
+ input_names (dict):
+ input_names dictionary, as given when the instance was created.
+ output_names (dict):
+ output_names dictionary, as given when the instance was created.
Args:
- primitives (list): List with the names of the primitives that will
- compose this pipeline.
- init_params (dict): dictionary containing initialization arguments to
- be passed when creating the MLBlocks instances.
- The dictionary keys must be the corresponding
- primitive names and the values must be another
- dictionary that will be passed as `**kargs` to the
- MLBlock instance.
- input_names (dict): dictionary that maps input variable names with the
- actual names expected by each primitive. This
- allows reusing the same input argument for multiple
- primitives that name it differently, as well as
- passing different values to primitives that expect
- arguments named similary.
- output_names (dict): dictionary that maps output variable names with
- the name these variables will be given when stored
- in the context dictionary. This allows storing
- the output of different primitives in different
- variables, even if the primitive output name is
- the same one.
+ primitives (list):
+ List with the names of the primitives that will compose this pipeline.
+ init_params (dict):
+ dictionary containing initialization arguments to be passed when creating the
+ MLBlocks instances. The dictionary keys must be the corresponding primitive names
+ and the values must be another dictionary that will be passed as `**kwargs` to the
+ MLBlock instance.
+ input_names (dict):
+ dictionary that maps input variable names with the actual names expected by each
+ primitive. This allows reusing the same input argument for multiple primitives that
+ name it differently, as well as passing different values to primitives that expect
+ arguments named similary.
+ output_names (dict):
+ dictionary that maps output variable names with the name these variables will be
+ given when stored in the context dictionary. This allows storing the output of
+ different primitives in different variables, even if the primitive output name is
+ the same one.
"""
def _get_tunable_hyperparameters(self):
@@ -133,9 +130,9 @@ def set_hyperparameters(self, hyperparameters):
"""Set new hyperparameter values for some blocks.
Args:
- hyperparameters (dict): A dictionary containing the block names as
- keys and the new hyperparameters dictionary
- as values.
+ hyperparameters (dict):
+ A dictionary containing the block names as keys and the new hyperparameters
+ dictionary as values.
"""
for block_name, block_hyperparams in hyperparameters.items():
self.blocks[block_name].set_hyperparameters(block_hyperparams)
@@ -148,11 +145,12 @@ def _get_block_args(self, block_name, block_args, context):
was created.
Args:
- block_name (str): Name of this block. Used to find the corresponding
- input_names.
- block_args (list): list of method argument specifications from the
- primitive.
- context (dict): current context dictionary.
+ block_name (str):
+ Name of this block. Used to find the corresponding input_names.
+ block_args (list):
+ list of method argument specifications from the primitive.
+ context (dict):
+ current context dictionary.
Returns:
dict:
@@ -213,22 +211,40 @@ def _get_block_name(self, index):
return list(self.blocks.keys())[index]
def _get_output_spec(self, output):
- """Parsre the output specification and get a block name and a variable name.
+ """Parse the output specification and get a block name and a variable name.
The output specification can be of two types: int and str.
If it is an integer, it is interpreted as a block index, and the variable name
is considered to be ``None``, which means that the whole context will be returned.
- If it is a string, it is interpreted as the block name, and it has to match a block
- name exactly, including its hash and counter number ``#n``. Optionally, a variable
- name can be passed at the end using a ``'.'`` as a separator.
- In this case, the format of the string is `{block_name}.{variable_name}`. Note
- that the block name can also contain dots, so the string will be split on the
- rightmost dot, and only if the complete string does not match a block name exactly.
+ If it is a string, it can be interpreted in three ways:
+
+ * **block name**: If the string matches a block name exactly, including
+ its hash and counter number ``#n`` at the end, the whole context will be
+ returned after that block is produced.
+ * **variable_name**: If the string does not match any block name and does
+ not contain any dot character, ``'.'``, it will be considered a variable
+ name. In this case, the indicated variable will be extracted from the
+ context and returned after the last block has been produced.
+ * **block_name + variable_name**: If the complete string does not match a
+ block name but it contains at least one dot, ``'.'``, it will be split
+ in two parts on the last dot. If the first part of the string matches a
+ block name exactly, the second part of the string will be considered a
+ variable name, assuming the format ``{block_name}.{variable_name}``, and
+ the indicated variable will be extracted from the context and returned
+ after the block has been produced. Otherwise, if the extracted
+ ``block_name`` does not match a block name exactly, a ``ValueError``
+ will be raised.
Args:
- output (str or int): Output specification as either a string or an integer.
+ output (str or int):
+ Output specification as either a string or an integer.
+
+ Raises:
+ ValueError:
+ If the output string contains dots but the part before the last one
+ does not match a block name exactly.
Returns:
tuple:
@@ -239,15 +255,21 @@ def _get_output_spec(self, output):
It can be ``None``, which means that the whole context is to be
returned.
"""
+ # If None is given, both block and variable are None
if output is None:
return None, None
+ # If an int is given, it is a block index and there is no variable
if isinstance(output, int):
output = self._get_block_name(output)
+ return output, None
+ # If the string matches a block name, there is no variable
if output in self.blocks:
return output, None
+ # If there is at least one dot in the output, but it did not match
+ # a block name, it is considered to be {block_name}.{variable_name}
if '.' in output:
output_block, output_variable = output.rsplit('.', 1)
if output_block not in self.blocks:
@@ -255,6 +277,9 @@ def _get_output_spec(self, output):
return output_block, output_variable
+ # If the given string is not a block name and it has no dots,
+ # it is considered to be a variable name to be extracted
+ # from the context after the last block has been produced
last_block_name = self._get_block_name(-1)
return last_block_name, output
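In code, the three string interpretations resolve roughly as follows (a sketch,
assuming the two-block pipeline used in the tests):

    # Block name: the whole context, right after that block produces
    ctx = pipeline.predict(X_test, output_='sklearn.preprocessing.StandardScaler#1')

    # Variable name: 'y' extracted from the context after the last block
    y_pred = pipeline.predict(X_test, output_='y')

    # Block name + variable name: 'X' extracted right after the first block
    X_scaled = pipeline.predict(X_test, output_='sklearn.preprocessing.StandardScaler#1.X')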
@@ -285,25 +310,48 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
`produce` calls will be taken.
Args:
- X: Fit Data, which the pipeline will learn from.
- y: Fit Data labels, which the pipeline will use to learn how to
- behave.
- output_ (str or int): Output specification, which can be a string or an integer.
- If an integer is given, it is interpreted as the block number, and the whole
- context after running the specified block will be returned.
- If a string is given, it is expected to be the name of one block, including
- its counter number at the end. Optionally, a variable name can be included
- at the end after the counter number using a ``'.'`` as a separator between the
- block name and the variable name. If the variable name is given, this will be
- extracted from the context and returned. Otherwise, the whole context will
- be returned.
- start_ (str or int): Block index or block name to start processing from. The
+ X:
+ Fit Data, which the pipeline will learn from.
+
+ y:
+ Fit Data labels, which the pipeline will use to learn how to
+ behave.
+
+ output_ (str or int or None):
+ Output specification, which can be a string, an integer or None.
+
+ * If it is None (default), nothing will be returned.
+ * If an integer is given, it is interpreted as the block number, and the whole
+ context after running the specified block will be returned.
+ * If it is a string, it can be interpreted in three ways:
+
+ * **block name**: If the string matches a block name exactly, including
+ its hash and counter number ``#n`` at the end, the whole context will be
+ returned after that block is produced.
+ * **variable_name**: If the string does not match any block name and does
+ not contain any dot character, ``'.'``, it will be considered a variable
+ name. In this case, the indicated variable will be extracted from the
+ context and returned after the last block has been produced.
+ * **block_name + variable_name**: If the complete string does not match a
+ block name but it contains at least one dot, ``'.'``, it will be split
+ in two parts on the last dot. If the first part of the string matches a
+ block name exactly, the second part of the string will be considered a
+ variable name, assuming the format ``{block_name}.{variable_name}``, and
+ the indicated variable will be extracted from the context and returned
+ after the block has been produced. Otherwise, if the extracted
+ ``block_name`` does not match a block name exactly, a ``ValueError``
+ will be raised.
+
+ start_ (str or int or None):
+ Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
- **kwargs: Any additional keyword arguments will be directly added
- to the context dictionary and available for the blocks.
+
+ **kwargs:
+ Any additional keyword arguments will be directly added
+ to the context dictionary and available for the blocks.
Returns:
None or dict or object:
@@ -328,11 +376,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
start_ = self._get_block_name(start_)
for block_name, block in self.blocks.items():
- if block_name == start_:
- start_ = False
- elif start_:
- LOGGER.debug("Skipping block %s fit", block_name)
- continue
+ if start_:
+ if block_name == start_:
+ start_ = False
+ else:
+ LOGGER.debug("Skipping block %s fit", block_name)
+ continue
LOGGER.debug("Fitting block %s", block_name)
try:
@@ -357,7 +406,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
if block_name == output_block:
return self._get_output(output_variable, context)
- def predict(self, X=None, output_='y', start_=None, **kwargs):
+ if start_:
+ # We skipped all the blocks up to the end
+ raise ValueError('Unknown block name: {}'.format(start_))
+
+ def predict(self, X=None, output_=None, start_=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the `produce` method of each block, capturing the
@@ -369,23 +422,43 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
will be taken.
Args:
- X: Data which the pipeline will use to make predictions.
- output_ (str or int): Output specification, which can be a string or an integer.
- If an integer is given, it is interpreted as the block number, and the whole
- context after running the specified block will be returned.
- If a string is given, it is expected to be the name of one block, including
- its counter number at the end. Optionally, a variable name can be included
- at the end after the counter number using a ``'.'`` as a separator between the
- block name and the variable name. If the variable name is given, this will be
- extracted from the context and returned. Otherwise, the whole context will
- be returned.
- start_ (str or int): Block index or block name to start processing from. The
+ X:
+ Data which the pipeline will use to make predictions.
+
+ output_ (str or int or None):
+ Output specification, which can be a string, an integer or None.
+
+ * If it is None (default), the output of the last block will be returned.
+ * If an integer is given, it is interpreted as the block number, and the whole
+ context after running the specified block will be returned.
+ * If it is a string, it can be interpreted in three ways:
+
+ * **block name**: If the string matches a block name exactly, including
+ its hash and counter number ``#n`` at the end, the whole context will be
+ returned after that block is produced.
+ * **variable_name**: If the string does not match any block name and does
+ not contain any dot character, ``'.'``, it will be considered a variable
+ name. In this case, the indicated variable will be extracted from the
+ context and returned after the last block has been produced.
+ * **block_name + variable_name**: If the complete string does not match a
+ block name but it contains at least one dot, ``'.'``, it will be split
+ in two parts on the last dot. If the first part of the string matches a
+ block name exactly, the second part of the string will be considered a
+ variable name, assuming the format ``{block_name}.{variable_name}``, and
+ the indicated variable will be extracted from the context and returned
+ after the block has been produced. Otherwise, if the extracted
+ ``block_name`` does not match a block name exactly, a ``ValueError``
+ will be raised.
+
+ start_ (str or int or None):
+ Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
- **kwargs: Any additional keyword arguments will be directly added
- to the context dictionary and available for the blocks.
+
+ **kwargs:
+ Any additional keyword arguments will be directly added
+ to the context dictionary and available for the blocks.
Returns:
None or dict or object:
@@ -408,11 +481,12 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
start_ = self._get_block_name(start_)
for block_name, block in self.blocks.items():
- if block_name == start_:
- start_ = False
- elif start_:
- LOGGER.debug("Skipping block %s produce", block_name)
- continue
+ if start_:
+ if block_name == start_:
+ start_ = False
+ else:
+ LOGGER.debug("Skipping block %s produce", block_name)
+ continue
LOGGER.debug("Producing block %s", block_name)
try:
@@ -432,6 +506,9 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
+ if output_ is None:
+ return outputs
+
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
@@ -487,7 +564,8 @@ def save(self, path):
The content of the JSON file is the dict returned by the `to_dict` method.
Args:
- path (str): Path to the JSON file to write.
+ path (str):
+ Path to the JSON file to write.
"""
with open(path, 'w') as out_file:
json.dump(self.to_dict(), out_file, indent=4)
@@ -499,7 +577,8 @@ def from_dict(cls, metadata):
The dict structure is the same as the one created by the `to_dict` method.
Args:
- metadata (dict): Dictionary containing the pipeline specification.
+ metadata (dict):
+ Dictionary containing the pipeline specification.
Returns:
MLPipeline:
@@ -531,7 +610,8 @@ def load(cls, path):
The JSON file format is the same as the one created by the `to_dict` method.
Args:
- path (str): Path of the JSON file to load.
+ path (str):
+ Path of the JSON file to load.
Returns:
MLPipeline:
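The `save` and `load` pair thus gives a simple persistence round trip. A usage sketch:

    # Persist the pipeline specification as JSON and restore it later
    pipeline.save('pipeline.json')
    pipeline = MLPipeline.load('pipeline.json')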
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index 9bca6a5d..f2300f67 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -37,6 +37,7 @@ def add_primitives_path(path):
Raises:
ValueError: A `ValueError` will be raised if the path is not valid.
+
"""
if path not in _PRIMITIVES_PATHS:
if not os.path.isdir(path):
@@ -68,7 +69,6 @@ def get_primitives_paths():
list:
The list of folders.
"""
-
primitives_paths = list()
entry_points = pkg_resources.iter_entry_points('mlprimitives')
for entry_point in entry_points:
@@ -99,7 +99,6 @@ def load_primitive(name):
ValueError: A `ValueError` will be raised if the primitive cannot be
found.
"""
-
for base_path in get_primitives_paths():
parts = name.split('.')
number_of_parts = len(parts)
diff --git a/setup.cfg b/setup.cfg
index 62ced521..17244565 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,3 +45,9 @@ collect_ignore = ['setup.py']
[tool:pylint]
good-names = X,y
+
+[doc8]
+max-line-length = 99
+
+[pydocstyle]
+add-ignore = D403,D413,D105,D107
diff --git a/setup.py b/setup.py
index f6991ab1..c73eb0a6 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,10 @@
# Advanced testing
'tox>=2.9.1',
'coverage>=4.5.1',
+
+ # Documentation style
+ 'doc8==0.8.0',
+ 'pydocstyle==3.0.0'
]
From 711201650e50e7ef0c3861347ac89abfa1a5c77d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 9 May 2019 15:42:10 +0200
Subject: [PATCH 020/160] Add missing dependency
---
setup.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/setup.py b/setup.py
index c73eb0a6..f355be93 100644
--- a/setup.py
+++ b/setup.py
@@ -40,9 +40,10 @@
'm2r>=0.2.0',
'Sphinx>=1.7.1',
'sphinx_rtd_theme>=0.2.4',
- 'graphviz==0.9',
- 'ipython==6.5.0',
- 'matplotlib==2.2.3',
+ 'graphviz>=0.9',
+ 'ipython>=6.5.0',
+ 'matplotlib>=2.2.3',
+ 'autodocsumm>=0.1.10',
# style check
'flake8>=3.5.0',
@@ -61,8 +62,8 @@
'coverage>=4.5.1',
# Documentation style
- 'doc8==0.8.0',
- 'pydocstyle==3.0.0'
+ 'doc8>=0.8.0',
+ 'pydocstyle>=3.0.0'
]
From b26e527117cc45f94ed87c558e528a9a3276ff6f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 16 May 2019 19:50:47 +0200
Subject: [PATCH 021/160] Move default and keyword arguments logic to MLBlock
---
mlblocks/mlblock.py | 55 ++++++++++++++++++++++++++++++++++++------
mlblocks/mlpipeline.py | 14 +----------
setup.py | 3 ++-
3 files changed, 50 insertions(+), 22 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index c3878e68..80f5baa2 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -222,6 +222,43 @@ def set_hyperparameters(self, hyperparameters):
LOGGER.debug('Creating a new primitive instance for %s', self.name)
self.instance = self.primitive(**self._hyperparameters)
+ def _get_method_kwargs(self, kwargs, method_args):
+ """Prepare the kwargs for the method.
+
+ The kwargs dict will be altered according to the method_args
+ specification to make them ready for the primitive method to
+ accept them.
+
+ Args:
+ kwargs (dict):
+ keyword arguments that have been passed to the block method.
+ method_args (list):
+ method arguments as specified in the JSON annotation.
+
+ Returns:
+ dict:
+ A dictionary containing the argument names and values to pass
+ to the primitive method.
+ """
+
+ method_kwargs = dict()
+ for arg in method_args:
+ name = arg['name']
+ keyword = arg.get('keyword', name)
+
+ if name in kwargs:
+ value = kwargs[name]
+
+ elif 'default' in arg:
+ value = arg['default']
+
+ else:
+ raise TypeError("missing expected argument '{}'".format(name))
+
+ method_kwargs[keyword] = value
+
+ return method_kwargs
+
def fit(self, **kwargs):
"""Call the fit method of the primitive.
@@ -244,9 +281,10 @@ def fit(self, **kwargs):
method is given.
"""
if self.fit_method is not None:
- fit_args = self._fit_params.copy()
- fit_args.update(kwargs)
- getattr(self.instance, self.fit_method)(**fit_args)
+ fit_kwargs = self._fit_params.copy()
+ fit_kwargs.update(kwargs)
+ fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args)
+ getattr(self.instance, self.fit_method)(**fit_kwargs)
def produce(self, **kwargs):
"""Call the primitive function, or the predict method of the primitive.
@@ -262,10 +300,11 @@ def produce(self, **kwargs):
The output of the call to the primitive function or primitive
produce method.
"""
- produce_args = self._produce_params.copy()
- produce_args.update(kwargs)
+ produce_kwargs = self._produce_params.copy()
+ produce_kwargs.update(kwargs)
+ produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args)
if self._class:
- return getattr(self.instance, self.produce_method)(**produce_args)
+ return getattr(self.instance, self.produce_method)(**produce_kwargs)
- produce_args.update(self._hyperparameters)
- return self.primitive(**produce_args)
+ produce_kwargs.update(self._hyperparameters)
+ return self.primitive(**produce_kwargs)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index eddb442e..9a0a109e 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -164,22 +164,10 @@ def _get_block_args(self, block_name, block_args, context):
kwargs = dict()
for arg in block_args:
name = arg['name']
- keyword = arg.get('keyword', name)
variable = input_names.get(name, name)
if variable in context:
- value = context[variable]
-
- elif 'default' in arg:
- value = arg['default']
-
- else:
- raise TypeError(
- "Expected argument '{}.{}' not found in context"
- .format(block_name, variable)
- )
-
- kwargs[keyword] = value
+ kwargs[name] = context[variable]
return kwargs
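A sketch of the kind of argument specification `_get_method_kwargs` consumes,
written out as Python data (hypothetical annotation fragment, for illustration only):

    # One entry per argument of the primitive method, as found in the JSON
    # annotation: 'keyword' renames the argument for the call and 'default'
    # fills it in when it is missing from the kwargs.
    produce_args = [
        {'name': 'X'},
        {'name': 'threshold', 'keyword': 'decision_threshold', 'default': 0.5},
    ]

    # With kwargs == {'X': data}, the resolved call becomes:
    #     instance.predict(X=data, decision_threshold=0.5)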
diff --git a/setup.py b/setup.py
index f355be93..9fca4dfa 100644
--- a/setup.py
+++ b/setup.py
@@ -15,13 +15,14 @@
install_requires = [
- 'mlprimitives>=0.1.3',
]
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
+ 'mlprimitives>=0.1.3,<0.2',
+ 'urllib3>=1.20,<1.25'
]
From 00f11647ab11456f5e2d6761cd36170796ac5250 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 12:16:33 -0400
Subject: [PATCH 022/160] Load pipelines by name
---
mlblocks/__init__.py | 8 +-
mlblocks/discovery.py | 263 ++++++++++++++++++
mlblocks/mlblock.py | 2 +-
mlblocks/mlpipeline.py | 70 ++++-
mlblocks/primitives.py | 116 --------
tests/features/test_pipeline_loading.py | 106 +++++++
.../{test_primitives.py => test_discovery.py} | 40 +--
tests/test_mlpipeline.py | 6 +-
8 files changed, 460 insertions(+), 151 deletions(-)
create mode 100644 mlblocks/discovery.py
delete mode 100644 mlblocks/primitives.py
create mode 100644 tests/features/test_pipeline_loading.py
rename tests/{test_primitives.py => test_discovery.py} (60%)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index cf326495..37199013 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -10,9 +10,11 @@
* Documentation: https://HDI-Project.github.io/MLBlocks
"""
+from mlblocks.discovery import (
+ add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths,
+ load_pipeline, load_primitive)
from mlblocks.mlblock import MLBlock
from mlblocks.mlpipeline import MLPipeline
-from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive
__author__ = 'MIT Data To AI Lab'
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
@@ -21,6 +23,6 @@
__version__ = '0.3.1-dev'
__all__ = [
- 'MLBlock', 'MLPipeline', 'add_primitives_path',
- 'get_primitives_paths', 'load_primitive'
+ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
+ 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive'
]
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
new file mode 100644
index 00000000..78f12021
--- /dev/null
+++ b/mlblocks/discovery.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+
+"""
+Primitives and Pipelines discovery module.
+
+This module contains functions to load primitive and pipeline
+annotations, as well as to configure how MLBlocks finds the
+primitives and pipelines.
+"""
+
+import json
+import logging
+import os
+import sys
+
+import pkg_resources
+
+LOGGER = logging.getLogger(__name__)
+
+_PRIMITIVES_PATHS = [
+ os.path.join(os.getcwd(), 'mlprimitives'),
+ os.path.join(sys.prefix, 'mlprimitives'),
+ os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy
+ os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy
+]
+_PIPELINES_PATHS = [
+ os.path.join(os.getcwd(), 'mlpipelines'),
+]
+
+
+def _add_lookup_path(path, paths):
+ """Add a new path to lookup.
+
+ The new path will be inserted in the first place of the list,
+ so any element found in this new folder will take precedence
+ over any other element with the same name that existed in the
+ system before.
+
+ Args:
+ path (str):
+ path to add
+
+ Raises:
+ ValueError:
+ A `ValueError` will be raised if the path is not valid.
+
+ """
+ if path not in paths:
+ if not os.path.isdir(path):
+ raise ValueError('Invalid path: {}'.format(path))
+
+ paths.insert(0, os.path.abspath(path))
+ return True
+
+
+def add_primitives_path(path):
+ """Add a new path to look for primitives.
+
+ The new path will be inserted in the first place of the list,
+ so any primitive found in this new folder will take precedence
+ over any other primitive with the same name that existed in the
+ system before.
+
+ Args:
+ path (str):
+ path to add
+
+ Raises:
+ ValueError:
+ A `ValueError` will be raised if the path is not valid.
+ """
+ added = _add_lookup_path(path, _PRIMITIVES_PATHS)
+ if added:
+ LOGGER.debug('New primitives path added: %s', path)
+
+
+def add_pipelines_path(path):
+ """Add a new path to look for pipelines.
+
+ The new path will be inserted in the first place of the list,
+ so any pipeline found in this new folder will take precedence
+ over any other pipeline with the same name that existed in the
+ system before.
+
+ Args:
+ path (str):
+ path to add
+
+ Raises:
+ ValueError:
+ A `ValueError` will be raised if the path is not valid.
+ """
+ added = _add_lookup_path(path, _PIPELINES_PATHS)
+ if added:
+ LOGGER.debug('New pipelines path added: %s', path)
+
+
+def _get_lookup_paths(entry_point):
+ """Get the list of folders where elements will be looked for.
+
+ This list will include the value of any `entry_point` named `jsons_path` published under
+ the given entry_point name.
+
+ An example of such an entry point would be::
+
+ entry_points = {
+ 'mlprimitives': [
+ 'jsons_path=some_module:SOME_VARIABLE'
+ ]
+ }
+
+ where the module `some_module` contains a variable such as::
+
+ SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+ Args:
+ entry_point:
+ The name of the `entry_point` to look for.
+
+ Returns:
+ list:
+ The list of folders.
+ """
+ lookup_paths = list()
+ entry_points = pkg_resources.iter_entry_points(entry_point)
+ for entry_point in entry_points:
+ if entry_point.name == 'jsons_path':
+ path = entry_point.load()
+ lookup_paths.append(path)
+
+ return lookup_paths
+
+
+def get_primitives_paths():
+ """Get the list of folders where primitives will be looked for.
+
+ This list will include the value of any `entry_point` named `jsons_path` published under
+ the `mlprimitives` name.
+
+ An example of such an entry point would be::
+
+ entry_points = {
+ 'mlprimitives': [
+ 'jsons_path=some_module:SOME_VARIABLE'
+ ]
+ }
+
+ where the module `some_module` contains a variable such as::
+
+ SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+ Returns:
+ list:
+ The list of folders.
+ """
+ return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives')
+
+
+def get_pipelines_paths():
+ """Get the list of folders where pipelines will be looked for.
+
+ This list will include the value of any `entry_point` named `jsons_path` published under
+ the `mlpipelines` name.
+
+ An example of such an entry point would be::
+
+ entry_points = {
+ 'mlpipelines': [
+ 'jsons_path=some_module:SOME_VARIABLE'
+ ]
+ }
+
+ where the module `some_module` contains a variable such as::
+
+ SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+ Returns:
+ list:
+ The list of folders.
+ """
+ return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines')
+
+
+def _load(name, paths):
+ """Locate and load the JSON annotation in any of the given paths.
+
+ All the given paths will be scanned to find a JSON file with the given name,
+ and as soon as a JSON with the given name is found it is returned.
+
+ Args:
+ name (str):
+ name of the JSON to look for. The name should not contain the
+ `.json` extension, as it will be added dynamically.
+ paths (list):
+ list of folders to scan for the JSON file.
+
+ Returns:
+ dict:
+ The content of the JSON annotation file loaded into a dict.
+ """
+ for base_path in paths:
+ parts = name.split('.')
+ number_of_parts = len(parts)
+
+ for folder_parts in range(number_of_parts):
+ folder = os.path.join(base_path, *parts[:folder_parts])
+ filename = '.'.join(parts[folder_parts:]) + '.json'
+ json_path = os.path.join(folder, filename)
+
+ if os.path.isfile(json_path):
+ with open(json_path, 'r') as json_file:
+ LOGGER.debug('Loading %s from %s', name, json_path)
+ return json.load(json_file)
+
+
+def load_primitive(name):
+ """Locate and load the primitive JSON annotation.
+
+ All the primitive paths will be scanned to find a JSON file with the given name,
+ and as soon as a JSON with the given name is found it is returned.
+
+ Args:
+ name (str):
+ name of the JSON to look for. The name should not contain the
+ `.json` extension, as it will be added dynamically.
+
+ Returns:
+ dict:
+ The content of the JSON annotation file loaded into a dict.
+
+ Raises:
+ ValueError:
+ A `ValueError` will be raised if the primitive cannot be found.
+ """
+ primitive = _load(name, get_primitives_paths())
+ if not primitive:
+ raise ValueError("Unknown primitive: {}".format(name))
+
+ return primitive
+
+
+def load_pipeline(name):
+ """Locate and load the pipeline JSON annotation.
+
+ All the pipeline paths will be scanned to find a JSON file with the
+ given name, and the first one found is returned.
+
+ Args:
+ name (str):
+ name of the JSON to look for. The name should not contain the
+ `.json` extension, as it will be added dynamically.
+
+ Returns:
+ dict:
+ The content of the JSON annotation file loaded into a dict.
+
+ Raises:
+ ValueError:
+ A `ValueError` will be raised if the pipeline cannot be found.
+ """
+ pipeline = _load(name, get_pipelines_paths())
+ if not pipeline:
+ raise ValueError("Unknown pipeline: {}".format(name))
+
+ return pipeline
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 80f5baa2..1ab4a557 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -5,7 +5,7 @@
import importlib
import logging
-from mlblocks.primitives import load_primitive
+from mlblocks.discovery import load_primitive
LOGGER = logging.getLogger(__name__)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 9a0a109e..dc12b41f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -6,6 +6,7 @@
import logging
from collections import Counter, OrderedDict
+from mlblocks.discovery import load_pipeline
from mlblocks.mlblock import MLBlock
LOGGER = logging.getLogger(__name__)
@@ -46,6 +47,12 @@ class MLPipeline():
output_names dictionary, as given when the instance was created.
Args:
+ pipeline (str, list, dict or MLPipeline):
+ The pipeline argument accepts four different types with different interpretations:
+ * `str`: the name of the pipeline to search and load.
+ * `list`: the primitives list.
+ * `dict`: a complete pipeline specification.
+ * `MLPipeline`: another pipeline to be cloned.
primitives (list):
List with the names of the primitives that will compose this pipeline.
init_params (dict):
@@ -73,10 +80,9 @@ def _get_tunable_hyperparameters(self):
return tunable
- def __init__(self, primitives, init_params=None, input_names=None, output_names=None):
- self.primitives = primitives
- self.init_params = init_params or dict()
- self.blocks = OrderedDict()
+ @staticmethod
+ def _build_blocks(primitives, init_params):
+ blocks = OrderedDict()
block_names_count = Counter()
for primitive in primitives:
@@ -84,23 +90,67 @@ def __init__(self, primitives, init_params=None, input_names=None, output_names=
block_names_count.update([primitive])
block_count = block_names_count[primitive]
block_name = '{}#{}'.format(primitive, block_count)
- block_params = self.init_params.get(block_name, dict())
+ block_params = init_params.get(block_name, dict())
if not block_params:
- block_params = self.init_params.get(primitive, dict())
+ block_params = init_params.get(primitive, dict())
if block_params and block_count > 1:
LOGGER.warning(("Non-numbered init_params are being used "
"for more than one block %s."), primitive)
block = MLBlock(primitive, **block_params)
- self.blocks[block_name] = block
+ blocks[block_name] = block
except Exception:
LOGGER.exception("Exception caught building MLBlock %s", primitive)
raise
- self.input_names = input_names or dict()
- self.output_names = output_names or dict()
- self._tunable_hyperparameters = self._get_tunable_hyperparameters()
+ return blocks
+
+ @staticmethod
+ def _get_pipeline_dict(pipeline, primitives):
+
+ if isinstance(pipeline, dict):
+ return pipeline
+
+ elif isinstance(pipeline, str):
+ return load_pipeline(pipeline)
+
+ elif isinstance(pipeline, MLPipeline):
+ return pipeline.to_dict()
+
+ elif isinstance(pipeline, list):
+ if primitives is not None:
+ raise ValueError('If `pipeline` is a `list`, `primitives` must be `None`.')
+
+ return {'primitives': pipeline}
+
+ elif pipeline is None:
+ if primitives is None:
+ raise ValueError('Either `pipeline` or `primitives` must not be `None`.')
+
+ return dict()
+
+ def __init__(self, pipeline=None, primitives=None, init_params=None,
+ input_names=None, output_names=None):
+
+ pipeline = self._get_pipeline_dict(pipeline, primitives)
+
+ self.primitives = primitives or pipeline['primitives']
+ self.init_params = init_params or pipeline.get('init_params', dict())
+ self.blocks = self._build_blocks(self.primitives, self.init_params)
+
+ self.input_names = input_names or pipeline.get('input_names', dict())
+ self.output_names = output_names or pipeline.get('output_names', dict())
+
+ tunable = pipeline.get('tunable_hyperparameters')
+ if tunable is not None:
+ self._tunable_hyperparameters = tunable
+ else:
+ self._tunable_hyperparameters = self._get_tunable_hyperparameters()
+
+ hyperparameters = pipeline.get('hyperparameters')
+ if hyperparameters:
+ self.set_hyperparameters(hyperparameters)
def get_tunable_hyperparameters(self):
"""Get the tunable hyperparamters of each block.
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
deleted file mode 100644
index f2300f67..00000000
--- a/mlblocks/primitives.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Primitives module.
-
-This module contains functions to load primitive annotations,
-as well as to configure how MLBlocks finds the primitives.
-"""
-
-import json
-import logging
-import os
-import sys
-
-import pkg_resources
-
-LOGGER = logging.getLogger(__name__)
-
-_PRIMITIVES_PATHS = [
- os.path.join(os.getcwd(), 'mlprimitives'),
- os.path.join(sys.prefix, 'mlprimitives'),
- os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy
- os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy
-]
-
-
-def add_primitives_path(path):
- """Add a new path to look for primitives.
-
- The new path will be inserted in the first place of the list,
- so any primitive found in this new folder will take precedence
- over any other primitive with the same name that existed in the
- system before.
-
- Args:
- path (str): path to add
-
- Raises:
- ValueError: A `ValueError` will be raised if the path is not valid.
-
- """
- if path not in _PRIMITIVES_PATHS:
- if not os.path.isdir(path):
- raise ValueError('Invalid path: {}'.format(path))
-
- LOGGER.debug('Adding new primitives path %s', path)
- _PRIMITIVES_PATHS.insert(0, os.path.abspath(path))
-
-
-def get_primitives_paths():
- """Get the list of folders where the primitives will be looked for.
-
- This list will include the value of any `entry_point` named `jsons_path` published under
- the name `mlprimitives`.
-
- An example of such an entry point would be::
-
- entry_points = {
- 'mlprimitives': [
- 'jsons_path=some_module:SOME_VARIABLE'
- ]
- }
-
- where the module `some_module` contains a variable such as::
-
- SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
-
- Returns:
- list:
- The list of folders.
- """
- primitives_paths = list()
- entry_points = pkg_resources.iter_entry_points('mlprimitives')
- for entry_point in entry_points:
- if entry_point.name == 'jsons_path':
- path = entry_point.load()
- primitives_paths.append(path)
-
- return _PRIMITIVES_PATHS + primitives_paths
-
-
-def load_primitive(name):
- """Locate and load the JSON annotation of the given primitive.
-
- All the paths found in _PRIMITIVES_PATHS will be scanned to find a JSON file
- with the given name, and as soon as a JSON with the given name is found it
- is returned.
-
- Args:
- name (str): name of the primitive to look for. The name should
- correspond to the primitive, not to the filename, as the
- `.json` extension will be added dynamically.
-
- Returns:
- dict:
- The content of the JSON annotation file loaded into a dict.
-
- Raises:
- ValueError: A `ValueError` will be raised if the primitive cannot be
- found.
- """
- for base_path in get_primitives_paths():
- parts = name.split('.')
- number_of_parts = len(parts)
-
- for folder_parts in range(number_of_parts):
- folder = os.path.join(base_path, *parts[:folder_parts])
- filename = '.'.join(parts[folder_parts:]) + '.json'
- json_path = os.path.join(folder, filename)
-
- if os.path.isfile(json_path):
- with open(json_path, 'r') as json_file:
- LOGGER.debug('Loading primitive %s from %s', name, json_path)
- return json.load(json_file)
-
- raise ValueError("Unknown primitive: {}".format(name))
diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py
new file mode 100644
index 00000000..bc344d63
--- /dev/null
+++ b/tests/features/test_pipeline_loading.py
@@ -0,0 +1,106 @@
+from unittest import TestCase
+from unittest.mock import Mock
+
+from mlblocks import MLPipeline
+
+
+class TestMLPipeline(TestCase):
+
+ def test_dict(self):
+ pipeline_dict = {
+ 'primitives': [
+ 'sklearn.ensemble.RandomForestClassifier'
+ ],
+ 'init_params': {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ },
+ 'input_names': {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'X': 'X1'
+ }
+ },
+ 'output_names': {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'y': 'y1'
+ }
+ }
+ }
+
+ pipeline = MLPipeline(pipeline_dict)
+
+ assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier']
+ assert pipeline.init_params == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+ assert pipeline.input_names == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'X': 'X1'
+ }
+ }
+ assert pipeline.output_names == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'y': 'y1'
+ }
+ }
+
+ def test_list(self):
+ primitives = [
+ 'sklearn.ensemble.RandomForestClassifier'
+ ]
+ init_params = {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+
+ pipeline = MLPipeline(primitives, init_params=init_params)
+
+ assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier']
+ assert pipeline.init_params == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+
+ def test_none(self):
+ primitives = [
+ 'sklearn.ensemble.RandomForestClassifier'
+ ]
+ init_params = {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+
+ pipeline = MLPipeline(primitives=primitives, init_params=init_params)
+
+ assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier']
+ assert pipeline.init_params == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+
+ def test_mlpipeline(self):
+ primitives = [
+ 'sklearn.ensemble.RandomForestClassifier'
+ ]
+ init_params = {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
+
+ pipeline = MLPipeline(primitives=primitives, init_params=init_params)
+ pipeline2 = MLPipeline(pipeline)
+
+ assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier']
+ assert pipeline2.init_params == {
+ 'sklearn.ensemble.RandomForest#1': {
+ 'n_estimators': 500
+ }
+ }
diff --git a/tests/test_primitives.py b/tests/test_discovery.py
similarity index 60%
rename from tests/test_primitives.py
rename to tests/test_discovery.py
index 1afd17b6..3a7c3321 100644
--- a/tests/test_primitives.py
+++ b/tests/test_discovery.py
@@ -9,57 +9,57 @@
import pytest
from pkg_resources import Distribution, EntryPoint
-from mlblocks import primitives
+from mlblocks import discovery
FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
def test_add_primitives_path_do_nothing():
- primitives.add_primitives_path('a')
+ discovery.add_primitives_path('a')
- assert primitives._PRIMITIVES_PATHS == ['a', 'b']
+ assert discovery._PRIMITIVES_PATHS == ['a', 'b']
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
def test_add_primitives_path_exception():
invalid_path = str(uuid.uuid4())
with pytest.raises(ValueError):
- primitives.add_primitives_path(invalid_path)
+ discovery.add_primitives_path(invalid_path)
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
def test_add_primitives_path():
- primitives.add_primitives_path('tests')
+ discovery.add_primitives_path('tests')
expected_path = os.path.abspath('tests')
- assert primitives._PRIMITIVES_PATHS == [expected_path, 'a', 'b']
+ assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b']
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery.pkg_resources.iter_entry_points')
def test_get_primitives_paths_no_entry_points(iep_mock):
# setup
iep_mock.return_value = []
# run
- paths = primitives.get_primitives_paths()
+ paths = discovery.get_primitives_paths()
# assert
assert paths == ['a', 'b']
iep_mock.assert_called_once_with('mlprimitives')
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery.pkg_resources.iter_entry_points')
def test_get_primitives_paths_entry_points(iep_mock):
# setup
something_else_ep = EntryPoint('something_else', 'mlblocks.__version__')
jsons_path_ep = EntryPoint(
'jsons_path',
- 'tests.test_primitives',
+ 'tests.test_discovery',
attrs=['FAKE_MLPRIMITIVES_PATH'],
dist=Distribution()
)
@@ -69,7 +69,7 @@ def test_get_primitives_paths_entry_points(iep_mock):
]
# run
- paths = primitives.get_primitives_paths()
+ paths = discovery.get_primitives_paths()
# assert
expected = [
@@ -82,10 +82,10 @@ def test_get_primitives_paths_entry_points(iep_mock):
iep_mock.assert_called_once_with('mlprimitives')
-@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
def test_load_primitive_value_error():
with pytest.raises(ValueError):
- primitives.load_primitive('invalid.primitive')
+ discovery.load_primitive('invalid.primitive')
def test_load_primitive_success():
@@ -95,11 +95,11 @@ def test_load_primitive_success():
}
with tempfile.TemporaryDirectory() as tempdir:
- primitives.add_primitives_path(tempdir)
+ discovery.add_primitives_path(tempdir)
primitive_path = os.path.join(tempdir, 'temp.primitive.json')
with open(primitive_path, 'w') as primitive_file:
json.dump(primitive, primitive_file, indent=4)
- loaded = primitives.load_primitive('temp.primitive')
+ loaded = discovery.load_primitive('temp.primitive')
assert primitive == loaded
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 2fa6d097..741be194 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -39,7 +39,11 @@ def test___init__(self, mlblock_mock, logger_mock):
}
expected_input_names = input_names.copy()
- mlpipeline = MLPipeline(primitives, init_params, input_names)
+ mlpipeline = MLPipeline(
+ primitives=primitives,
+ init_params=init_params,
+ input_names=input_names
+ )
assert mlpipeline.primitives == expected_primitives
assert mlpipeline.init_params == expected_init_params
From eb36fcb12f79401c776b0269be35b7c64e1ea22d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 13:54:02 -0400
Subject: [PATCH 023/160] Fix docs
---
docs/advanced_usage/pipelines.rst | 2 +-
docs/api/mlblocks.discovery.rst | 5 +++
docs/api/mlblocks.primitives.rst | 5 ---
docs/getting_started/quickstart.rst | 2 +-
docs/index.rst | 2 +-
mlblocks/discovery.py | 34 +++++++--------
mlblocks/mlblock.py | 31 +++++++-------
mlblocks/mlpipeline.py | 57 +++++++++++++------------
tests/features/test_pipeline_loading.py | 1 -
9 files changed, 70 insertions(+), 69 deletions(-)
create mode 100644 docs/api/mlblocks.discovery.rst
delete mode 100644 docs/api/mlblocks.primitives.rst
diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index cc7ccc49..33d57cdc 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -86,7 +86,7 @@ This can be done by passing an extra dictionary to the MLPipeline when it is cre
'n_estimators': 100
}
}
- pipeline = MLPipeline(primitives, init_params)
+ pipeline = MLPipeline(primitives, init_params=init_params)
This dictionary must have as keys the name of the blocks that the arguments belong to, and
as values the dictionary that contains the argument names and their values.
diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst
new file mode 100644
index 00000000..c9109130
--- /dev/null
+++ b/docs/api/mlblocks.discovery.rst
@@ -0,0 +1,5 @@
+mlblocks.discovery
+==================
+
+.. automodule:: mlblocks.discovery
+ :members:
diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst
deleted file mode 100644
index d625c774..00000000
--- a/docs/api/mlblocks.primitives.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-mlblocks.primitives
-===================
-
-.. automodule:: mlblocks.primitives
- :members:
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index 2115fcef..c3edf475 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -38,7 +38,7 @@ Optionally, specific `hyperparameters`_ can be also set by specifying them in a
'n_estimators': 100
}
}
- pipeline = MLPipeline(primitives, hyperparameters)
+ pipeline = MLPipeline(primitives, init_params=hyperparameters)
Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set
for each block, by calling the `get_hyperparameters method`_.
diff --git a/docs/index.rst b/docs/index.rst
index 2bb4c5a9..c3655b3c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -74,7 +74,7 @@ integrate with deep learning libraries.
api/mlblocks
api/mlblocks.datasets
- api/mlblocks.primitives
+ api/mlblocks.discovery
.. toctree::
:caption: Resources
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 78f12021..1f952b81 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -42,7 +42,7 @@ def _add_lookup_path(path, paths):
Raises:
ValueError:
- A `ValueError` will be raised if the path is not valid.
+ A ``ValueError`` will be raised if the path is not valid.
"""
if path not in paths:
@@ -67,7 +67,7 @@ def add_primitives_path(path):
Raises:
ValueError:
- A `ValueError` will be raised if the path is not valid.
+ A ``ValueError`` will be raised if the path is not valid.
"""
added = _add_lookup_path(path, _PRIMITIVES_PATHS)
if added:
@@ -88,7 +88,7 @@ def add_pipelines_path(path):
Raises:
ValueError:
- A `ValueError` will be raised if the path is not valid.
+ A ``ValueError`` will be raised if the path is not valid.
"""
added = _add_lookup_path(path, _PIPELINES_PATHS)
if added:
@@ -98,7 +98,7 @@ def add_pipelines_path(path):
def _get_lookup_paths(entry_point):
"""Get the list of folders where elements will be looked for.
- This list will include the value of any `entry_point` named `jsons_path` published under
+ This list will include the value of any ``entry_point`` named ``jsons_path`` published under
the given entry point name.
An example of such an entry point would be::
@@ -109,13 +109,13 @@ def _get_lookup_paths(entry_point):
]
}
- where the module `some_module` contains a variable such as::
+ where the module ``some_module`` contains a variable such as::
SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
Args:
entry_point:
- The name of the `entry_point` to look for.
+ The name of the ``entry_point`` to look for.
Returns:
list:
@@ -134,8 +134,8 @@ def _get_lookup_paths(entry_point):
def get_primitives_paths():
"""Get the list of folders where primitives will be looked for.
- This list will include the value of any `entry_point` named `jsons_path` published under
- the `mlprimitives` name.
+ This list will include the value of any ``entry_point`` named ``jsons_path`` published under
+ the ``mlprimitives`` name.
An example of such an entry point would be::
@@ -145,7 +145,7 @@ def get_primitives_paths():
]
}
- where the module `some_module` contains a variable such as::
+ where the module ``some_module`` contains a variable such as::
SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
@@ -159,8 +159,8 @@ def get_primitives_paths():
def get_pipelines_paths():
"""Get the list of folders where pipelines will be looked for.
- This list will include the value of any `entry_point` named `jsons_path` published under
- the `mlpipelines` name.
+ This list will include the value of any ``entry_point`` named ``jsons_path`` published under
+ the ``mlpipelines`` name.
An example of such an entry point would be::
@@ -170,7 +170,7 @@ def get_pipelines_paths():
]
}
- where the module `some_module` contains a variable such as::
+ where the module ``some_module`` contains a variable such as::
SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
@@ -190,7 +190,7 @@ def _load(name, paths):
Args:
name (str):
name of the JSON to look for. The name should not contain the
- `.json` extension, as it will be added dynamically.
+ ``.json`` extension, as it will be added dynamically.
Returns:
dict:
@@ -220,7 +220,7 @@ def load_primitive(name):
Args:
name (str):
name of the JSON to look for. The name should not contain the
- `.json` extension, as it will be added dynamically.
+ ``.json`` extension, as it will be added dynamically.
Returns:
dict:
@@ -228,7 +228,7 @@ def load_primitive(name):
Raises:
ValueError:
- A `ValueError` will be raised if the primitive cannot be found.
+ A ``ValueError`` will be raised if the primitive cannot be found.
"""
primitive = _load(name, get_primitives_paths())
if not primitive:
@@ -246,7 +246,7 @@ def load_pipeline(name):
Args:
name (str):
name of the JSON to look for. The name should not contain the
- `.json` extension, as it will be added dynamically.
+ ``.json`` extension, as it will be added dynamically.
Returns:
dict:
@@ -254,7 +254,7 @@ def load_pipeline(name):
Raises:
ValueError:
- A `ValueError` will be raised if the pipeline cannot be found.
+ A ``ValueError`` will be raised if the pipeline cannot be found.
"""
pipeline = _load(name, get_pipelines_paths())
if not pipeline:
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 1ab4a557..66bbf8fe 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -30,15 +30,16 @@ class MLBlock():
primitive (object):
the actual function or instance which this MLBlock wraps.
fit_args (dict):
- specification of the arguments expected by the `fit` method.
+ specification of the arguments expected by the ``fit`` method.
fit_method (str):
- name of the primitive method to call on `fit`. `None` if the primitive is a function.
+ name of the primitive method to call on ``fit``. ``None`` if the
+ primitive is a function.
produce_args (dict):
- specification of the arguments expected by the `predict` method.
+ specification of the arguments expected by the ``predict`` method.
produce_output (dict):
- specification of the outputs of the `produce` method.
+ specification of the outputs of the ``produce`` method.
produce_method (str):
- name of the primitive method to call on `produce`. `None` if the primitive is a
+ name of the primitive method to call on ``produce``. ``None`` if the primitive is a
function.
Args:
@@ -46,19 +47,19 @@ class MLBlock():
Name given to this MLBlock.
**kwargs:
Any additional arguments that will be used as hyperparameters or passed to the
- `fit` or `produce` methods.
+ ``fit`` or ``produce`` methods.
Raises:
TypeError:
- A `TypeError` is raised if a required argument is not found within the `kwargs`
+ A ``TypeError`` is raised if a required argument is not found within the ``kwargs``
or if an unexpected argument has been given.
""" # pylint: disable=too-many-instance-attributes
def _extract_params(self, kwargs, hyperparameters):
"""Extract init, fit and produce params from kwargs.
- The `init_params`, `fit_params` and `produce_params` are extracted
- from the passed `kwargs` taking the metadata hyperparameters as a
+ The ``init_params``, ``fit_params`` and ``produce_params`` are extracted
+ from the passed ``kwargs`` taking the metadata hyperparameters as a
reference.
During this extraction, make sure that all the required hyperparameters
@@ -66,15 +67,15 @@ def _extract_params(self, kwargs, hyperparameters):
Args:
kwargs (dict):
- dict containing the Keyword arguments that have been passed to the `__init__`
+ dict containing the Keyword arguments that have been passed to the ``__init__``
method upon initialization.
hyperparameters (dict):
hyperparameters dictionary, as found in the JSON annotation.
Raises:
TypeError:
- A `TypeError` is raised if a required argument is not found in the `kwargs` dict,
- or if an unexpected argument has been given.
+ A ``TypeError`` is raised if a required argument is not found in the
+ ``kwargs`` dict, or if an unexpected argument has been given.
"""
init_params = dict()
fit_params = dict()
@@ -262,7 +263,7 @@ def _get_method_kwargs(self, kwargs, method_args):
def fit(self, **kwargs):
"""Call the fit method of the primitive.
- The given keyword arguments will be passed directly to the `fit`
+ The given keyword arguments will be passed directly to the ``fit``
method of the primitive instance specified in the JSON annotation.
If any of the arguments expected by the produce method had been
@@ -277,7 +278,7 @@ def fit(self, **kwargs):
Raises:
TypeError:
- A `TypeError` might be raised if any argument not expected by the primitive fit
+ A ``TypeError`` might be raised if any argument not expected by the primitive fit
method is given.
"""
if self.fit_method is not None:
@@ -290,7 +291,7 @@ def produce(self, **kwargs):
"""Call the primitive function, or the predict method of the primitive.
The given keyword arguments will be passed directly to the primitive,
- if it is a simple function, or to the `produce` method of the
+ if it is a simple function, or to the ``produce`` method of the
primitive instance specified in the JSON annotation, if it is a class.
If any of the arguments expected by the fit method had been given
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dc12b41f..b73d96b9 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -58,7 +58,7 @@ class MLPipeline():
init_params (dict):
dictionary containing initialization arguments to be passed when creating the
MLBlocks instances. The dictionary keys must be the corresponding primitive names
- and the values must be another dictionary that will be passed as `**kargs` to the
+ and the values must be another dictionary that will be passed as ``**kwargs`` to the
MLBlock instance.
input_names (dict):
dictionary that maps input variable names with the actual names expected by each
@@ -191,7 +191,7 @@ def _get_block_args(self, block_name, block_args, context):
"""Get the arguments expected by the block method from the context.
The arguments will be taken from the context using both the method
- arguments specification and the `input_names` given when the pipeline
+ arguments specification and the ``input_names`` given when the pipeline
was created.
Args:
@@ -245,7 +245,7 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
return output_dict
def _get_block_name(self, index):
- """Get the name of the block in the `index` position."""
+ """Get the name of the block in the ``index`` position."""
return list(self.blocks.keys())[index]
def _get_output_spec(self, output):
@@ -338,14 +338,14 @@ def _get_output(self, output_variable, context):
def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
"""Fit the blocks of this pipeline.
- Sequentially call the `fit` and the `produce` methods of each block,
- capturing the outputs each `produce` method before calling the `fit`
+ Sequentially call the ``fit`` and the ``produce`` methods of each block,
+ capturing the outputs each ``produce`` method before calling the ``fit``
method of the next one.
During the whole process a context dictionary is built, where both the
- passed arguments and the captured outputs of the `produce` methods
- are stored, and from which the arguments for the next `fit` and
- `produce` calls will be taken.
+ passed arguments and the captured outputs of the ``produce`` methods
+ are stored, and from which the arguments for the next ``fit`` and
+ ``produce`` calls will be taken.
Args:
X:
@@ -451,12 +451,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
def predict(self, X=None, output_=None, start_=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
- Sequentially call the `produce` method of each block, capturing the
+ Sequentially call the ``produce`` method of each block, capturing the
outputs before calling the next one.
During the whole process a context dictionary is built, where both the
- passed arguments and the captured outputs of the `produce` methods
- are stored, and from which the arguments for the next `produce` calls
+ passed arguments and the captured outputs of the ``produce`` methods
+ are stored, and from which the arguments for the next ``produce`` calls
will be taken.
Args:
@@ -550,7 +550,7 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
- The dict structure contains all the `__init__` arguments of the
+ The dict structure contains all the ``__init__`` arguments of the
MLPipeline, as well as the current hyperparameter values and the
specification of the tunable_hyperparameters::
@@ -599,7 +599,7 @@ def to_dict(self):
def save(self, path):
"""Save the specification of this MLPipeline in a JSON file.
- The content of the JSON file is the dict returned by the `to_dict` method.
+ The content of the JSON file is the dict returned by the ``to_dict`` method.
Args:
path (str):
@@ -612,7 +612,7 @@ def save(self, path):
def from_dict(cls, metadata):
"""Create a new MLPipeline from a dict specification.
- The dict structure is the same as the one created by the `to_dict` method.
+ The dict structure is the same as the one created by the ``to_dict`` method.
Args:
metadata (dict):
@@ -623,29 +623,30 @@ def from_dict(cls, metadata):
A new MLPipeline instance with the details found in the
given specification dictionary.
"""
- hyperparameters = metadata.get('hyperparameters')
- tunable = metadata.get('tunable_hyperparameters')
+ # hyperparameters = metadata.get('hyperparameters')
+ # tunable = metadata.get('tunable_hyperparameters')
- pipeline = cls(
- metadata['primitives'],
- metadata.get('init_params'),
- metadata.get('input_names'),
- metadata.get('output_names'),
- )
+ # pipeline = cls(
+ # metadata['primitives'],
+ # metadata.get('init_params'),
+ # metadata.get('input_names'),
+ # metadata.get('output_names'),
+ # )
- if hyperparameters:
- pipeline.set_hyperparameters(hyperparameters)
+ # if hyperparameters:
+ # pipeline.set_hyperparameters(hyperparameters)
- if tunable is not None:
- pipeline._tunable_hyperparameters = tunable
+ # if tunable is not None:
+ # pipeline._tunable_hyperparameters = tunable
- return pipeline
+ # return pipeline
+ return cls(metadata)
@classmethod
def load(cls, path):
"""Create a new MLPipeline from a JSON specification.
- The JSON file format is the same as the one created by the `to_dict` method.
+ The JSON file format is the same as the one created by the ``to_dict`` method.
Args:
path (str):
diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py
index bc344d63..4b363d07 100644
--- a/tests/features/test_pipeline_loading.py
+++ b/tests/features/test_pipeline_loading.py
@@ -1,5 +1,4 @@
from unittest import TestCase
-from unittest.mock import Mock
from mlblocks import MLPipeline
From d97ad54e547665488fd2dcea21ec8369d95fcb7f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 13:56:51 -0400
Subject: [PATCH 024/160] Update the readme to the latest API changes
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index fb8d3885..cd454b73 100644
--- a/README.md
+++ b/README.md
@@ -81,10 +81,10 @@ them to the `MLPipeline` class.
>>> pipeline = MLPipeline(primitives)
```
-Optionally, specific hyperparameters can be also set by specifying them in a dictionary:
+Optionally, specific initialization arguments can also be set by specifying them in a dictionary:
```python
->>> hyperparameters = {
+>>> init_params = {
... 'skimage.feature.hog': {
... 'multichannel': True,
... 'visualize': False
@@ -93,7 +93,7 @@ Optionally, specific hyperparameters can be also set by specifying them in a dic
... 'n_estimators': 100,
... }
... }
->>> pipeline = MLPipeline(primitives, hyperparameters)
+>>> pipeline = MLPipeline(primitives, init_params=init_params)
```
If you want to see which hyperparameters a particular pipeline is using, you can do so by calling
From 221cfb82ac9f6f7cd413043429916a5528567b0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 16:24:00 -0400
Subject: [PATCH 025/160] Remove commented code
---
mlblocks/mlpipeline.py | 17 -----------------
1 file changed, 17 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b73d96b9..ce31780f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -623,23 +623,6 @@ def from_dict(cls, metadata):
A new MLPipeline instance with the details found in the
given specification dictionary.
"""
- # hyperparameters = metadata.get('hyperparameters')
- # tunable = metadata.get('tunable_hyperparameters')
-
- # pipeline = cls(
- # metadata['primitives'],
- # metadata.get('init_params'),
- # metadata.get('input_names'),
- # metadata.get('output_names'),
- # )
-
- # if hyperparameters:
- # pipeline.set_hyperparameters(hyperparameters)
-
- # if tunable is not None:
- # pipeline._tunable_hyperparameters = tunable
-
- # return pipeline
return cls(metadata)
@classmethod
From 197c47f3cc7bbe6e683a971866bbd9e52b9821d9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 17:05:49 -0400
Subject: [PATCH 026/160] Add instructions to install MLPrimitives
---
README.md | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/README.md b/README.md
index cd454b73..01629dc8 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,26 @@ make install
For development, you can use `make install-develop` instead in order to install all
the required dependencies for testing and code linting.
+## MLPrimitives
+
+In order to be usable, MLBlocks requires a compatible primitives library.
+
+The official library, which is required in order to follow the MLBlocks tutorial
+below, is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), and you can
+install it with this command:
+
+```bash
+pip install mlprimitives
+```
+
# Usage Example
Below is a short example of how to use MLBlocks to create a simple pipeline, fit it
using demo data and use it to make predictions.
+Please make sure to have installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
+before following it.
+
For advanced usage and a more detailed explanation of each component, please have a look
at the [documentation](https://HDI-Project.github.io/MLBlocks)
From d451c7c3d2f9eb4972f8a1c38edbb468410b7d44 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 15:12:57 -0400
Subject: [PATCH 027/160] Address PR feedback
---
docs/getting_started/quickstart.rst | 7 ++++---
mlblocks/discovery.py | 9 +++++++++
mlblocks/mlpipeline.py | 11 +++++------
3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index c3edf475..2887da05 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -29,16 +29,17 @@ them to the `MLPipeline class`_:
]
pipeline = MLPipeline(primitives)
-Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary:
+Optionally, specific `hyperparameters`_ can also be set by specifying them in a dictionary and
+passing them as the ``init_params`` argument:
.. ipython:: python
- hyperparameters = {
+ init_params = {
'sklearn.ensemble.RandomForestClassifier': {
'n_estimators': 100
}
}
- pipeline = MLPipeline(primitives, init_params=hyperparameters)
+ pipeline = MLPipeline(primitives, init_params=init_params)
Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set
for each block, by calling the `get_hyperparameters method`_.
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 1f952b81..51ff13cd 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -39,11 +39,16 @@ def _add_lookup_path(path, paths):
Args:
path (str):
path to add
+ paths (list):
+ list where the new path will be added.
Raises:
ValueError:
A ``ValueError`` will be raised if the path is not valid.
+ Returns:
+ bool:
+ Whether the new path was added or not.
"""
if path not in paths:
if not os.path.isdir(path):
@@ -52,6 +57,8 @@ def _add_lookup_path(path, paths):
paths.insert(0, os.path.abspath(path))
return True
+ return False
+
def add_primitives_path(path):
"""Add a new path to look for primitives.
@@ -191,6 +198,8 @@ def _load(name, paths):
name (str):
name of the JSON to look for. The name should not contain the
``.json`` extension, as it will be added dynamically.
+ paths (list):
+ list of paths where the primitives will be looked for.
Returns:
dict:
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index ce31780f..b31502ea 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -80,19 +80,18 @@ def _get_tunable_hyperparameters(self):
return tunable
- @staticmethod
- def _build_blocks(primitives, init_params):
+ def _build_blocks(self):
blocks = OrderedDict()
block_names_count = Counter()
- for primitive in primitives:
+ for primitive in self.primitives:
try:
block_names_count.update([primitive])
block_count = block_names_count[primitive]
block_name = '{}#{}'.format(primitive, block_count)
- block_params = init_params.get(block_name, dict())
+ block_params = self.init_params.get(block_name, dict())
if not block_params:
- block_params = init_params.get(primitive, dict())
+ block_params = self.init_params.get(primitive, dict())
if block_params and block_count > 1:
LOGGER.warning(("Non-numbered init_params are being used "
"for more than one block %s."), primitive)
@@ -137,7 +136,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
self.primitives = primitives or pipeline['primitives']
self.init_params = init_params or pipeline.get('init_params', dict())
- self.blocks = self._build_blocks(self.primitives, self.init_params)
+ self.blocks = self._build_blocks()
self.input_names = input_names or pipeline.get('input_names', dict())
self.output_names = output_names or pipeline.get('output_names', dict())
From 8b2b7aaecd72637d9769bfb9ad94025f242e2872 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 16:16:27 -0400
Subject: [PATCH 028/160] Rename mlprimitives.jsons_path to mlblocks.primitives
and support multiple paths
---
mlblocks/discovery.py | 48 +++++++++++++++++++++++++------------------
1 file changed, 28 insertions(+), 20 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 51ff13cd..b5ca840d 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -102,17 +102,17 @@ def add_pipelines_path(path):
LOGGER.debug('New pipelines path added: %s', path)
-def _get_lookup_paths(entry_point):
- """Get the list of folders where elements will be looked for.
+def _load_entry_points(entry_point_name, entry_point_group='mlblocks'):
+ """Get a list of folders from entry points.
- This list will include the value of any ``entry_point`` named ``jsons_path`` published under
- the given entry point name.
+ This list will include the value of any entry point named after the given
+ ``entry_point_name`` published under the given ``entry_point_group``.
An example of such an entry point would be::
entry_points = {
- 'mlprimitives': [
- 'jsons_path=some_module:SOME_VARIABLE'
+ 'mlblocks': [
+ 'primitives=some_module:SOME_VARIABLE'
]
}
@@ -129,11 +129,14 @@ def _get_lookup_paths(entry_point):
The list of folders.
"""
lookup_paths = list()
- entry_points = pkg_resources.iter_entry_points(entry_point)
+ entry_points = pkg_resources.iter_entry_points(entry_point_group)
for entry_point in entry_points:
- if entry_point.name == 'jsons_path':
- path = entry_point.load()
- lookup_paths.append(path)
+ if entry_point.name == entry_point_name:
+ paths = entry_point.load()
+ if isinstance(paths, str):
+ lookup_paths.append(paths)
+ elif isinstance(paths, (list, tuple)):
+ lookup_paths.extend(paths)
return lookup_paths
@@ -141,14 +144,18 @@ def _get_lookup_paths(entry_point):
def get_primitives_paths():
"""Get the list of folders where primitives will be looked for.
- This list will include the value of any ``entry_point`` named ``jsons_path`` published under
- the ``mlprimitives`` name.
+ This list will include the values of all the entry points named ``primitives``
+ published under the entry point group ``mlblocks``.
+
+ Also, for backwards compatibility reasons, the paths from the entry points
+ named ``jsons_path`` published under the ``mlprimitives`` group will also
+ be included.
An example of such an entry point would be::
entry_points = {
- 'mlprimitives': [
- 'jsons_path=some_module:SOME_VARIABLE'
+ 'mlblocks': [
+ 'primitives=some_module:SOME_VARIABLE'
]
}
@@ -160,20 +167,21 @@ def get_primitives_paths():
list:
The list of folders.
"""
- return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives')
+ paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives')
+ return _PRIMITIVES_PATHS + paths
def get_pipelines_paths():
"""Get the list of folders where pipelines will be looked for.
- This list will include the value of any ``entry_point`` named ``jsons_path`` published under
- the ``mlpipelines`` name.
+ This list will include the values of all the entry points named ``pipelines``
+ published under the entry point group ``mlblocks``.
An example of such an entry point would be::
entry_points = {
- 'mlpipelines': [
- 'jsons_path=some_module:SOME_VARIABLE'
+ 'mlblocks': [
+ 'pipelines=some_module:SOME_VARIABLE'
]
}
@@ -185,7 +193,7 @@ def get_pipelines_paths():
list:
The list of folders.
"""
- return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines')
+ return _PIPELINES_PATHS + _load_entry_points('pipelines')
def _load(name, paths):
From cc012b013b27f8f301eda8adc8edcc2a79a37c57 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 16:16:41 -0400
Subject: [PATCH 029/160] Add unit tests for mlblocks discovery
---
tests/test_discovery.py | 151 ++++++++++++++++++++++++++++++++--------
1 file changed, 122 insertions(+), 29 deletions(-)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 3a7c3321..59bd4404 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -4,7 +4,7 @@
import os
import tempfile
import uuid
-from unittest.mock import patch
+from unittest.mock import call, patch
import pytest
from pkg_resources import Distribution, EntryPoint
@@ -14,92 +14,185 @@
FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
-@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_add_primitives_path_do_nothing():
- discovery.add_primitives_path('a')
+def test__add_lookup_path_do_nothing():
+ paths = ['a', 'b']
+ discovery._add_lookup_path('a', paths)
- assert discovery._PRIMITIVES_PATHS == ['a', 'b']
+ assert paths == ['a', 'b']
-@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_add_primitives_path_exception():
+def test__add_lookup_path_exception():
+ paths = ['a', 'b']
invalid_path = str(uuid.uuid4())
with pytest.raises(ValueError):
- discovery.add_primitives_path(invalid_path)
+ discovery._add_lookup_path(invalid_path, paths)
+
+
+def test__add_lookup_path():
+ paths = ['a', 'b']
+ discovery._add_lookup_path('tests', paths)
+
+ expected_path = os.path.abspath('tests')
+
+ assert paths == [expected_path, 'a', 'b']
@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
def test_add_primitives_path():
- discovery.add_primitives_path('tests')
+ discovery.add_primitives_path(os.path.abspath('tests'))
expected_path = os.path.abspath('tests')
-
assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b']
+@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b'])
+def test_add_pipelines_path():
+ discovery.add_pipelines_path('tests')
+
+ expected_path = os.path.abspath('tests')
+ assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b']
+
+
@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
@patch('mlblocks.discovery.pkg_resources.iter_entry_points')
-def test_get_primitives_paths_no_entry_points(iep_mock):
+def test__load_entry_points_no_entry_points(iep_mock):
# setup
iep_mock.return_value = []
# run
- paths = discovery.get_primitives_paths()
+ paths = discovery._load_entry_points('jsons_path', 'mlprimitives')
# assert
- assert paths == ['a', 'b']
- iep_mock.assert_called_once_with('mlprimitives')
+ assert paths == []
+ expected_calls = [
+ call('mlprimitives'),
+ ]
+ assert iep_mock.call_args_list == expected_calls
-@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
@patch('mlblocks.discovery.pkg_resources.iter_entry_points')
-def test_get_primitives_paths_entry_points(iep_mock):
+def test__load_entry_points_entry_points(iep_mock):
# setup
something_else_ep = EntryPoint('something_else', 'mlblocks.__version__')
- jsons_path_ep = EntryPoint(
- 'jsons_path',
+ primitives_ep = EntryPoint(
+ 'primitives',
'tests.test_discovery',
attrs=['FAKE_MLPRIMITIVES_PATH'],
dist=Distribution()
)
iep_mock.return_value = [
something_else_ep,
- jsons_path_ep
+ primitives_ep
]
# run
- paths = discovery.get_primitives_paths()
+ paths = discovery._load_entry_points('primitives')
# assert
expected = [
- 'a',
- 'b',
'this/is/a/fake'
]
assert paths == expected
- iep_mock.assert_called_once_with('mlprimitives')
+ expected_calls = [
+ call('mlblocks'),
+ ]
+ assert iep_mock.call_args_list == expected_calls
@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b'])
-def test_load_primitive_value_error():
- with pytest.raises(ValueError):
- discovery.load_primitive('invalid.primitive')
+@patch('mlblocks.discovery._load_entry_points')
+def test_get_primitives_paths(lep_mock):
+ lep_mock.side_effect = [['c'], []]
+
+ paths = discovery.get_primitives_paths()
+
+ assert paths == ['a', 'b', 'c']
+ expected_calls = [
+ call('primitives'),
+ call('jsons_path', 'mlprimitives'),
+ ]
+ assert lep_mock.call_args_list == expected_calls
+
+
+@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b'])
+@patch('mlblocks.discovery._load_entry_points')
+def test_get_pipelines_paths(lep_mock):
+ lep_mock.return_value = ['c']
+ paths = discovery.get_pipelines_paths()
-def test_load_primitive_success():
+ assert paths == ['a', 'b', 'c']
+ lep_mock.assert_called_once_with('pipelines')
+
+
+def test__load_value_error():
+ primitive = discovery._load('invalid.primitive', ['a', 'b'])
+
+ assert primitive is None
+
+
+def test__load_success():
primitive = {
'name': 'temp.primitive',
'primitive': 'temp.primitive'
}
with tempfile.TemporaryDirectory() as tempdir:
- discovery.add_primitives_path(tempdir)
+ paths = [tempdir]
primitive_path = os.path.join(tempdir, 'temp.primitive.json')
with open(primitive_path, 'w') as primitive_file:
json.dump(primitive, primitive_file, indent=4)
- loaded = discovery.load_primitive('temp.primitive')
+ loaded = discovery._load('temp.primitive', paths)
assert primitive == loaded
+
+
+@patch('mlblocks.discovery.get_primitives_paths')
+@patch('mlblocks.discovery._load')
+def test__load_primitive_value_error(load_mock, gpp_mock):
+ load_mock.return_value = None
+ gpp_mock.return_value = ['a', 'b']
+
+ with pytest.raises(ValueError):
+ discovery.load_primitive('invalid.primitive')
+
+ load_mock.assert_called_once_with('invalid.primitive', ['a', 'b'])
+
+
+@patch('mlblocks.discovery.get_primitives_paths')
+@patch('mlblocks.discovery._load')
+def test__load_primitive_success(load_mock, gpp_mock):
+ gpp_mock.return_value = ['a', 'b']
+
+ primitive = discovery.load_primitive('valid.primitive')
+
+ load_mock.assert_called_once_with('valid.primitive', ['a', 'b'])
+
+ assert primitive == load_mock.return_value
+
+
+@patch('mlblocks.discovery.get_pipelines_paths')
+@patch('mlblocks.discovery._load')
+def test__load_pipeline_value_error(load_mock, gpp_mock):
+ load_mock.return_value = None
+ gpp_mock.return_value = ['a', 'b']
+
+ with pytest.raises(ValueError):
+ discovery.load_pipeline('invalid.pipeline')
+
+ load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b'])
+
+
+@patch('mlblocks.discovery.get_pipelines_paths')
+@patch('mlblocks.discovery._load')
+def test__load_pipeline_success(load_mock, gpp_mock):
+ gpp_mock.return_value = ['a', 'b']
+
+ pipeline = discovery.load_pipeline('valid.pipeline')
+
+ load_mock.assert_called_once_with('valid.pipeline', ['a', 'b'])
+
+ assert pipeline == load_mock.return_value
From e5de2532b0c83d27c72efc615bd8c680720a5f2d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 19:39:52 -0400
Subject: [PATCH 030/160] Update docs about primitives entry_points
---
docs/advanced_usage/adding_primitives.rst | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst
index e3d4b964..9d358629 100644
--- a/docs/advanced_usage/adding_primitives.rst
+++ b/docs/advanced_usage/adding_primitives.rst
In order to make **MLBlocks** able to find the primitives defined in such a library,
all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
following specification:
-1. It has to be published under the name ``mlprimitives``.
-2. It has to be named exactly ``jsons_path``.
-3. It has to point at a variable that contains the path to the JSONS folder.
+1. It has to be published under the group ``mlblocks``.
+2. It has to be named exactly ``primitives``.
+3. It has to point at a variable that contains a path or a list of paths to the JSONS folder(s).
An example of such an entry point would be::
entry_points = {
- 'mlprimitives': [
- 'jsons_path=some_module:SOME_VARIABLE'
+ 'mlblocks': [
+ 'primitives=some_module:SOME_VARIABLE'
]
}
where the module `some_module` contains a variable such as::
- SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+ SOME_VARIABLE = 'path/to/primitives'
+
+or::
+
+ SOME_VARIABLE = [
+ 'path/to/primitives',
+ 'path/to/more/primitives'
+ ]
.. _Entry Point: https://packaging.python.org/specifications/entry-points/
From 67df52a740a2c4cfe2d0bad9de9168c70723634c Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 23 May 2019 18:11:44 -0400
Subject: [PATCH 031/160] Add functions to explore pipelines and primitives
---
mlblocks/discovery.py | 77 +++++++++++++++++++++++++++++++++++++----
mlblocks/mlblock.py | 12 ++++---
tests/test_discovery.py | 4 +--
3 files changed, 79 insertions(+), 14 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index b5ca840d..40853de9 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -11,6 +11,7 @@
import json
import logging
import os
+import re
import sys
import pkg_resources
@@ -23,6 +24,7 @@
os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy
os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy
]
+
_PIPELINES_PATHS = [
os.path.join(os.getcwd(), 'mlpipelines'),
]
@@ -168,7 +170,7 @@ def get_primitives_paths():
The list of folders.
"""
paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives')
- return _PRIMITIVES_PATHS + paths
+ return _PRIMITIVES_PATHS + list(set(paths))
def get_pipelines_paths():
@@ -228,6 +230,9 @@ def _load(name, paths):
return json.load(json_file)
+# Cache of primitive annotations that have already been loaded, keyed by name.
+_PRIMITIVES = dict()
+
+
def load_primitive(name):
"""Locate and load the primitive JSON annotation.
@@ -247,13 +252,20 @@ def load_primitive(name):
ValueError:
A ``ValueError`` will be raised if the primitive cannot be found.
"""
- primitive = _load(name, get_primitives_paths())
- if not primitive:
- raise ValueError("Unknown primitive: {}".format(name))
+ primitive = _PRIMITIVES.get(name)
+ if primitive is None:
+ primitive = _load(name, get_primitives_paths())
+ if primitive is None:
+ raise ValueError("Unknown primitive: {}".format(name))
+
+ _PRIMITIVES[name] = primitive
return primitive
+# Cache of pipeline annotations that have already been loaded, keyed by name.
+_PIPELINES = dict()
+
+
def load_pipeline(name):
"""Locate and load the pipeline JSON annotation.
@@ -273,8 +285,59 @@ def load_pipeline(name):
ValueError:
A ``ValueError`` will be raised if the pipeline cannot be found.
"""
- pipeline = _load(name, get_pipelines_paths())
- if not pipeline:
- raise ValueError("Unknown pipeline: {}".format(name))
+ pipeline = _PIPELINES.get(name)
+ if pipeline is None:
+ pipeline = _load(name, get_pipelines_paths())
+ if pipeline is None:
+ raise ValueError("Unknown pipeline: {}".format(name))
+
+ _PIPELINES[name] = pipeline
return pipeline
+
+
+def _search_annotations(base_path, pattern, parts=None):
+ annotations = dict()
+ parts = parts or list()
+ if os.path.exists(base_path):
+ for name in os.listdir(base_path):
+ path = os.path.abspath(os.path.join(base_path, name))
+ if os.path.isdir(path):
+ annotations.update(_search_annotations(path, pattern, parts + [name]))
+ elif path not in annotations:
+ name = '.'.join(parts + [name])
+ if pattern.search(name) and name.endswith('.json'):
+ annotations[path] = name[:-5]
+
+ return annotations
+
+
+def _get_annotations_list(paths, loader, pattern, **metadata_filters):
+ pattern = re.compile(pattern)
+ annotations = dict()
+ for base_path in paths:
+ annotations.update(_search_annotations(base_path, pattern))
+
+ matching = list()
+ for name in sorted(annotations.values()):
+ annotation = loader(name)
+ metadata = annotation.get('metadata', dict())
+ for key, value in metadata_filters.items():
+ metadata_value = metadata.get(key, '')
+ if not re.search(value, metadata_value):
+ break
+
+ else:
+ matching.append(name)
+
+ return matching
+
+
+def get_primitives_list(pattern='', **metadata_filters):
+ """Get the primitive annotations matching the pattern and metadata filters."""
+ return _get_annotations_list(
+ get_primitives_paths(), load_primitive, pattern, **metadata_filters)
+
+
+def get_pipelines_list(pattern='', **metadata_filters):
+ """Get the pipeline annotations matching the pattern and metadata filters."""
+ return _get_annotations_list(
+ get_pipelines_paths(), load_pipeline, pattern, **metadata_filters)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 66bbf8fe..6370b4cf 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -27,6 +27,8 @@ class MLBlock():
Attributes:
name (str):
Name given to this MLBlock.
+ metadata (dict):
+ Additional information about this primitive
primitive (object):
the actual function or instance which this MLBlock wraps.
fit_args (dict):
@@ -143,22 +145,22 @@ def _get_tunable(cls, hyperparameters, init_params):
def __init__(self, name, **kwargs):
self.name = name
- metadata = load_primitive(name)
+ primitive = load_primitive(name)
- self.primitive = import_object(metadata['primitive'])
+ self.primitive = import_object(primitive['primitive'])
- self._fit = metadata.get('fit', dict())
+ self._fit = primitive.get('fit', dict())
self.fit_args = self._fit.get('args', [])
self.fit_method = self._fit.get('method')
- self._produce = metadata['produce']
+ self._produce = primitive['produce']
self.produce_args = self._produce['args']
self.produce_output = self._produce['output']
self.produce_method = self._produce.get('method')
self._class = bool(self.produce_method)
- hyperparameters = metadata.get('hyperparameters', dict())
+ hyperparameters = primitive.get('hyperparameters', dict())
init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters)
self._hyperparameters = init_params
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 59bd4404..3681611b 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -11,7 +11,7 @@
from mlblocks import discovery
-FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+FAKE_PRIMITIVES_PATH = 'this/is/a/fake'
def test__add_lookup_path_do_nothing():
@@ -78,7 +78,7 @@ def test__load_entry_points_entry_points(iep_mock):
primitives_ep = EntryPoint(
'primitives',
'tests.test_discovery',
- attrs=['FAKE_MLPRIMITIVES_PATH'],
+ attrs=['FAKE_PRIMITIVES_PATH'],
dist=Distribution()
)
iep_mock.return_value = [
From 467948e4088915eabbe2e6853e2d88408a10e96d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 4 Jul 2019 17:57:48 -0400
Subject: [PATCH 032/160] Add support to work with hyperparameters in the
format used by BTB
---
mlblocks/mlpipeline.py | 103 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 100 insertions(+), 3 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b31502ea..3c08f444 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,7 +4,9 @@
import json
import logging
-from collections import Counter, OrderedDict
+from collections import Counter, OrderedDict, defaultdict
+
+import numpy as np
from mlblocks.discovery import load_pipeline
from mlblocks.mlblock import MLBlock
@@ -161,18 +163,112 @@ def get_tunable_hyperparameters(self):
"""
return self._tunable_hyperparameters.copy()
- def get_hyperparameters(self):
+ @classmethod
+ def _sanitize_value(cls, value):
+ """Convert numpy values to their python primitive type equivalent.
+
+ If a value is a dict, recursively sanitize its values.
+
+ Args:
+ value:
+ value to sanitize.
+
+ Returns:
+ sanitized value.
+ """
+ if isinstance(value, dict):
+ return {
+ key: cls._sanitize_value(value)
+ for key, value in value.items()
+ }
+ if isinstance(value, np.integer):
+ return int(value)
+ elif isinstance(value, np.floating):
+ return float(value)
+ elif isinstance(value, np.ndarray):
+ return value.tolist()
+ elif isinstance(value, np.bool_):
+ return bool(value)
+ elif value == 'None':
+ return None
+
+ return value
+
+ @classmethod
+ def _sanitize(cls, hyperparameters):
+ """Convert tuple hyperparameter keys to nested dicts.
+
+ Also convert numpy types to their Python primitive equivalents.
+
+ The input dict can specify hyperparameters in two formats:
+
+ One is the native MLBlocks format, where each key is the name of a block and each value
+ is a dict containing a complete hyperparameter specification for that block::
+
+ {
+ "block_name": {
+ "hyperparameter_name": "hyperparameter_value",
+ ...
+ },
+ ...
+ }
+
+ The other one is an alternative format where each key is a two-element tuple containing
+ the name of the block as the first element and the name of the hyperparameter as the
+ second one::
+
+ {
+ ("block_name", "hyperparameter_name"): "hyperparameter_value",
+ ...
+ }
+
+
+ Args:
+ hyperparameters (dict):
+ hyperparameters dict to sanitize.
+
+ Returns:
+ dict:
+ Sanitized dict.
+ """
+ params_tree = defaultdict(dict)
+ for key, value in hyperparameters.items():
+ value = cls._sanitize_value(value)
+ if isinstance(key, tuple):
+ block, hyperparameter = key
+ params_tree[block][hyperparameter] = value
+ else:
+ params_tree[key] = value
+
+ return params_tree
+
+ def get_hyperparameters(self, flat=False):
"""Get the current hyperparameters of each block.
+ Args:
+ flat (bool): If True, return a flattened dictionary where each key
+ is a two-element tuple containing the name of the block as the first
+ element and the name of the hyperparameter as the second one.
+ If False (default), return a dictionary where each key is the name of
+ a block and each value is a dictionary containing the complete
+ hyperparameter specification of that block.
+
Returns:
dict:
A dictionary containing the block names as keys and
the current block hyperparameters dictionary as values.
"""
- hyperparameters = {}
+ hyperparameters = dict()
for block_name, block in self.blocks.items():
hyperparameters[block_name] = block.get_hyperparameters()
+ if flat:
+ hyperparameters = {
+ (block, name): value
+ for block, block_hyperparameters in hyperparameters.items()
+ for name, value in block_hyperparameters.items()
+ }
+
return hyperparameters
def set_hyperparameters(self, hyperparameters):
@@ -183,6 +279,7 @@ def set_hyperparameters(self, hyperparameters):
A dictionary containing the block names as keys and the new hyperparameters
dictionary as values.
"""
+ hyperparameters = self._sanitize(hyperparameters)
for block_name, block_hyperparams in hyperparameters.items():
self.blocks[block_name].set_hyperparameters(block_hyperparams)
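
For reference, a minimal sketch of the two formats that `set_hyperparameters` accepts after this patch; the pipeline and block name are hypothetical and assume MLPrimitives is installed:

```python
from mlblocks import MLPipeline

# Hypothetical single-block pipeline.
pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# BTB-style flat format: keys are (block_name, hyperparameter_name) tuples.
pipeline.set_hyperparameters({
    ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 100,
})

# Equivalent native nested format: one dict per block.
pipeline.set_hyperparameters({
    'sklearn.ensemble.RandomForestClassifier#1': {'n_estimators': 100},
})
```

Both calls apply the same values, since `_sanitize` converts the flat form into the nested one before it is used.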
From 4a91c74badac8da64c02b2f25e935a734e71dcc0 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 5 Jul 2019 19:55:19 -0400
Subject: [PATCH 033/160] return flat tunables and add tests
---
mlblocks/mlpipeline.py | 30 +++++++---
tests/test_mlpipeline.py | 117 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 138 insertions(+), 9 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 3c08f444..36b71b29 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -153,15 +153,35 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
if hyperparameters:
self.set_hyperparameters(hyperparameters)
- def get_tunable_hyperparameters(self):
+ @staticmethod
+ def _flatten_dict(hyperparameters):
+ return {
+ (block, name): value
+ for block, block_hyperparameters in hyperparameters.items()
+ for name, value in block_hyperparameters.items()
+ }
+
+ def get_tunable_hyperparameters(self, flat=False):
"""Get the tunable hyperparameters of each block.
+ Args:
+ flat (bool): If True, return a flattened dictionary where each key
+ is a two-element tuple containing the name of the block as the first
+ element and the name of the hyperparameter as the second one.
+ If False (default), return a dictionary where each key is the name of
+ a block and each value is a dictionary containing the complete
+ hyperparameter specification of that block.
+
Returns:
dict:
A dictionary containing the block names as keys and
the block tunable hyperparameters dictionary as values.
"""
- return self._tunable_hyperparameters.copy()
+ tunables = self._tunable_hyperparameters.copy()
+ if flat:
+ tunables = self._flatten_dict(tunables)
+
+ return tunables
@classmethod
def _sanitize_value(cls, value):
@@ -263,11 +283,7 @@ def get_hyperparameters(self, flat=False):
hyperparameters[block_name] = block.get_hyperparameters()
if flat:
- hyperparameters = {
- (block, name): value
- for block, block_hyperparameters in hyperparameters.items()
- for name, value in block_hyperparameters.items()
- }
+ hyperparameters = self._flatten_dict(hyperparameters)
return hyperparameters
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 741be194..906c2c61 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -85,9 +85,72 @@ def test_get_tunable_hyperparameters(self):
assert returned == tunable
assert returned is not tunable
+ def test_get_tunable_hyperparameters_flat(self):
+ mlpipeline = MLPipeline(list())
+ tunable = {
+ 'block_1': {
+ 'hp_1': {
+ 'type': 'int',
+ 'range': [
+ 1,
+ 10
+ ],
+ }
+ },
+ 'block_2': {
+ 'hp_1': {
+ 'type': 'str',
+ 'default': 'a',
+ 'values': [
+ 'a',
+ 'b',
+ 'c'
+ ],
+ },
+ 'hp_2': {
+ 'type': 'bool',
+ 'default': True,
+ }
+ }
+ }
+ mlpipeline._tunable_hyperparameters = tunable
+
+ returned = mlpipeline.get_tunable_hyperparameters(flat=True)
+
+ expected = {
+ ('block_1', 'hp_1'): {
+ 'type': 'int',
+ 'range': [
+ 1,
+ 10
+ ],
+ },
+ ('block_2', 'hp_1'): {
+ 'type': 'str',
+ 'default': 'a',
+ 'values': [
+ 'a',
+ 'b',
+ 'c'
+ ],
+ },
+ ('block_2', 'hp_2'): {
+ 'type': 'bool',
+ 'default': True,
+ }
+ }
+ assert returned == expected
+
def test_get_hyperparameters(self):
block_1 = Mock()
+ block_1.get_hyperparameters.return_value = {
+ 'a': 'a'
+ }
block_2 = Mock()
+ block_2.get_hyperparameters.return_value = {
+ 'b': 'b',
+ 'c': 'c',
+ }
blocks = OrderedDict((
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
@@ -98,8 +161,40 @@ def test_get_hyperparameters(self):
hyperparameters = mlpipeline.get_hyperparameters()
assert hyperparameters == {
- 'a.primitive.Name#1': block_1.get_hyperparameters.return_value,
- 'a.primitive.Name#2': block_2.get_hyperparameters.return_value,
+ 'a.primitive.Name#1': {
+ 'a': 'a',
+ },
+ 'a.primitive.Name#2': {
+ 'b': 'b',
+ 'c': 'c',
+ },
+ }
+ block_1.get_hyperparameters.assert_called_once_with()
+ block_2.get_hyperparameters.assert_called_once_with()
+
+ def test_get_hyperparameters_flat(self):
+ block_1 = Mock()
+ block_1.get_hyperparameters.return_value = {
+ 'a': 'a'
+ }
+ block_2 = Mock()
+ block_2.get_hyperparameters.return_value = {
+ 'b': 'b',
+ 'c': 'c',
+ }
+ blocks = OrderedDict((
+ ('a.primitive.Name#1', block_1),
+ ('a.primitive.Name#2', block_2),
+ ))
+ mlpipeline = MLPipeline(list())
+ mlpipeline.blocks = blocks
+
+ hyperparameters = mlpipeline.get_hyperparameters(flat=True)
+
+ assert hyperparameters == {
+ ('a.primitive.Name#1', 'a'): 'a',
+ ('a.primitive.Name#2', 'b'): 'b',
+ ('a.primitive.Name#2', 'c'): 'c',
}
block_1.get_hyperparameters.assert_called_once_with()
block_2.get_hyperparameters.assert_called_once_with()
@@ -124,6 +219,24 @@ def test_set_hyperparameters(self):
block_1.set_hyperparameters.assert_not_called()
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+ def test_set_hyperparameters_flat(self):
+ block_1 = Mock()
+ block_2 = Mock()
+ blocks = OrderedDict((
+ ('a.primitive.Name#1', block_1),
+ ('a.primitive.Name#2', block_2),
+ ))
+ mlpipeline = MLPipeline(list())
+ mlpipeline.blocks = blocks
+
+ hyperparameters = {
+ ('a.primitive.Name#2', 'some'): 'arg'
+ }
+ mlpipeline.set_hyperparameters(hyperparameters)
+
+ block_1.set_hyperparameters.assert_not_called()
+ block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+
def test__get_block_args(self):
pass
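
Together with the previous patch, the flat format now makes a full round trip; a minimal sketch, assuming the pipeline object from the sketch above:

```python
# Tunable specs and current values, both keyed by (block_name, hp_name).
tunables = pipeline.get_tunable_hyperparameters(flat=True)
values = pipeline.get_hyperparameters(flat=True)

# A flat dict, such as a BTB tuner proposal, can be passed straight back.
pipeline.set_hyperparameters(values)
```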
From 5bd5a709f853b06b564bd607ee904ea9f95269c9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 5 Jul 2019 20:17:45 -0400
Subject: [PATCH 034/160] Fix setuptools version to fix dependency issues on
tests
---
setup.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 9fca4dfa..9c7b3d2e 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,8 @@
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'mlprimitives>=0.1.3,<0.2',
- 'urllib3>=1.20,<1.25'
+ 'urllib3>=1.20,<1.25',
+ 'setuptools>=41.0.0'
]
From 4dcf6022a78ca7230c7c0f714bd7185fdc4dd195 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 13:29:25 -0400
Subject: [PATCH 035/160] Add docs for intermediate outputs
---
docs/advanced_usage/pipelines.rst | 82 ++++++++++++++++++++++++++++++-
1 file changed, 81 insertions(+), 1 deletion(-)
diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index 33d57cdc..e87a0067 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -271,7 +271,7 @@ Like primitives, Pipelines can also be annotated and stored as dicts or JSON fil
the different arguments expected by the ``MLPipeline`` class, as well as the set hyperparameters
and tunable hyperparameters.
-Representing a Pipeline as a dict
+Representing a Pipeline as a dict
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The dict representation of a Pipeline can be obtained directly from an ``MLPipeline`` instance,
@@ -344,6 +344,86 @@ that allows loading the pipeline directly from a JSON file:
pipeline = MLPipeline.load('pipeline.json')
+
+Intermediate Outputs and Partial Execution
+------------------------------------------
+
+Sometimes we might be interested in capturing an intermediate output within a
+pipeline execution, either to inspect it for debugging purposes or to reuse
+it later on to speed up a tuning process where the pipeline needs to be
+executed multiple times over the same data.
+
+For this, two special arguments have been included in the ``fit`` and ``predict``
+methods of an MLPipeline:
+
+output\_
+~~~~~~~~
+
+The ``output_`` argument indicates which block within the pipeline we are interested
+in taking the output values from. This, implicitly, indicates up to which block the
+pipeline needs to be executed within ``fit`` and ``predict`` before returning.
+
+The ``output_`` argument is optional, and it can either be ``None``, which is the default,
+an Integer or a String.
+
+Its format is as follows:
+
+* If it is ``None`` (default), the ``fit`` method will return nothing and the
+ ``predict`` method will return the output of the last block in the pipeline.
+* If an integer is given, it is interpreted as the block index, starting at 0,
+ and the whole context after executing the specified block will be returned.
+ In the case of ``fit``, this means that the outputs will be returned after fitting
+ a block and then producing it on the same data.
+* If it is a string, it can be interpreted in three ways:
+
+ * **block name**: If the string matches a block name exactly, including
+ its hash and counter number ``#n`` at the end, the whole context will be
+ returned after that block is produced.
+ * **variable_name**: If the string does not match any block name and does
+ not contain any dot character, ``'.'``, it will be considered a variable
+ name. In this case, the indicated variable will be extracted from the
+ context and returned after the last block has been produced.
+ * **block_name + variable_name**: If the complete string does not match a
+ block name but it contains at least one dot, ``'.'``, it will be split
+ into two parts at the last dot. If the first part of the string matches a
+ block name exactly, the second part of the string will be considered a
+ variable name, assuming the format ``{block_name}.{variable_name}``, and
+ the indicated variable will be extracted from the context and returned
+ after the block has been produced. Otherwise, if the extracted
+ ``block_name`` does not match a block name exactly, a ``ValueError``
+ will be raised.
+
+start\_
+~~~~~~~
+
+The ``start_`` argument indicates which block within the pipeline we are interested
+in starting the computation from when executing ``fit`` and ``predict``, allowing us
+to skip some of the initial blocks.
+
+The ``start_`` argument is optional, and it can either be ``None``, which is the default,
+an Integer or a String.
+
+Its format is as follows:
+
+* If it is ``None``, the execution will start on the first block.
+* If it is an integer, it is interpreted as the block index.
+* If it is a string, it is expected to be the name of the block, including the counter
+ number at the end.
+
+This is especially useful when used in combination with the ``output_`` argument, as it
+effectively allows us both to capture intermediate outputs for debugging purposes and
+to reuse intermediate states of the pipeline to accelerate tuning processes.
+
+An example of this situation, where we want to reuse the output of the first block, could be::
+
+ context_0 = pipeline.fit(X_train, y_train, output_=0)
+
+ # Afterwards, within the tuning loop
+ pipeline.fit(start_=1, **context_0)
+ predictions = pipeline.predict(X_test)
+ score = compute_score(y_test, predictions)
+
+
.. _API Reference: ../api_reference.html
.. _primitives: ../primitives.html
.. _mlblocks.MLPipeline: ../api_reference.html#mlblocks.MLPipeline
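
To complement the snippet above, a hedged sketch of the string forms of ``output_``; the block and variable names are hypothetical and must exist in the pipeline:

```python
# Variable name only: extract 'X' from the context after the last block.
X_transformed = pipeline.predict(X_test, output_='X')

# '{block_name}.{variable_name}': extract 'X' from the context right
# after the named block has been produced.
context_X = pipeline.fit(X_train, y_train, output_='a.primitive.Name#1.X')
```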
From f93c8b155e6c17cc589bac2a6364e0db7443927d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 14:37:57 -0400
Subject: [PATCH 036/160] Add release notes for v0.3.1
---
HISTORY.md | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index a312c9cb..e6b14674 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,20 @@
Changelog
=========
+0.3.1 - Pipelines Discovery
+---------------------------
+
+* Support flat hyperparameter dictionaries
+ [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala
+* Load pipelines by name and register them as `entry_points`
+ [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala
+* Implement partial re-fit
+ [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala
+* Move argument parsing to MLBlock
+ [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala
+* Allow getting intermediate outputs
+ [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala
+
0.3.0 - New Primitives Discovery
--------------------------------
From 0d3ba9245e93a83f6a5d674e4cf84917ec3f898b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 14:38:04 -0400
Subject: [PATCH 037/160] =?UTF-8?q?Bump=20version:=200.3.1-dev=20=E2=86=92?=
=?UTF-8?q?=200.3.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 7 ++++---
setup.py | 2 +-
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 37199013..b47c8962 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.1-dev'
+__version__ = '0.3.1'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 17244565..d4103297 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
[bumpversion]
-current_version = 0.3.1-dev
+current_version = 0.3.1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = release
-values =
+values =
dev
release
@@ -51,3 +51,4 @@ max-line-length = 99
[pydocstyle]
add-ignore = D403,D413,D105,D107
+
diff --git a/setup.py b/setup.py
index 9c7b3d2e..3f01d72e 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.1-dev',
+ version='0.3.1',
zip_safe=False,
)
From 28a9a44373d10cd0b8e41ead686889535a4b7269 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 14:38:21 -0400
Subject: [PATCH 038/160] =?UTF-8?q?Bump=20version:=200.3.1=20=E2=86=92=200?=
=?UTF-8?q?.3.2-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b47c8962..b528aefe 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.1'
+__version__ = '0.3.2-dev'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index d4103297..1967b27b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.1
+current_version = 0.3.2-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 3f01d72e..98350606 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.1',
+ version='0.3.2-dev',
zip_safe=False,
)
From 677ef256ef5e23c4abfe52b8b5a2f839bf5cdf1d Mon Sep 17 00:00:00 2001
From: Kalyan Veeramachaneni
Date: Sun, 14 Jul 2019 19:01:25 -0700
Subject: [PATCH 039/160] Update README.md
---
README.md | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 01629dc8..5b4f2519 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,16 @@
-
-
+
+
+An open source project from Data to AI Lab at MIT.
-
-
+
+
+
+
+
+
+
Pipelines and Primitives for Machine Learning and Data Science.
-
[![PyPi][pypi-img]][pypi-url]
From 98b4d245c5cefc68f1ce3d1a7217f961dfe3378c Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 17 Jul 2019 14:29:07 +0200
Subject: [PATCH 040/160] Isolate block hyperparams from primitives
---
mlblocks/mlblock.py | 9 +++++----
tests/test_mlblock.py | 46 +++++++++++++++++++++++++++++++++----------
2 files changed, 41 insertions(+), 14 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 66bbf8fe..fa67bd6b 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -4,6 +4,7 @@
import importlib
import logging
+from copy import deepcopy
from mlblocks.discovery import load_primitive
@@ -192,7 +193,7 @@ def get_tunable_hyperparameters(self):
tuned, their types and, if applicable, the accepted
ranges or values.
"""
- return self._tunable.copy()
+ return deepcopy(self._tunable)
def get_hyperparameters(self):
"""Get hyperparameters values that the current MLBlock is using.
@@ -202,7 +203,7 @@ def get_hyperparameters(self):
the dictionary containing the hyperparameter values that the
MLBlock is currently using.
"""
- return self._hyperparameters.copy()
+ return deepcopy(self._hyperparameters)
def set_hyperparameters(self, hyperparameters):
"""Set new hyperparameters.
@@ -221,7 +222,7 @@ def set_hyperparameters(self, hyperparameters):
if self._class:
LOGGER.debug('Creating a new primitive instance for %s', self.name)
- self.instance = self.primitive(**self._hyperparameters)
+ self.instance = self.primitive(**self.get_hyperparameters())
def _get_method_kwargs(self, kwargs, method_args):
"""Prepare the kwargs for the method.
@@ -307,5 +308,5 @@ def produce(self, **kwargs):
if self._class:
return getattr(self.instance, self.produce_method)(**produce_kwargs)
- produce_kwargs.update(self._hyperparameters)
+ produce_kwargs.update(self.get_hyperparameters())
return self.primitive(**produce_kwargs)
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 5273d40c..16f1c6d1 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from unittest import TestCase
-from unittest.mock import patch
+from unittest.mock import MagicMock, Mock, patch
from mlblocks.mlblock import MLBlock, import_object
@@ -403,27 +403,53 @@ def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mo
assert returned == tunable
assert returned is not tunable
+ @patch('mlblocks.mlblock.import_object', new=Mock())
+ @patch('mlblocks.mlblock.load_primitive', new=MagicMock())
+ def test_get_hyperparameters(self):
+ """get_hyperparameters has to return a deepcopy of the _hyperparameters attribute."""
+ mlblock = MLBlock('given_primitive_name')
+
+ hyperparameters = {
+ 'a_list_param': ['a']
+ }
+ mlblock._hyperparameters = hyperparameters
+
+ returned = mlblock.get_hyperparameters()
+
+ assert returned == hyperparameters
+ assert returned is not hyperparameters
+
+ returned['a_list_param'].append('b')
+ assert 'b' not in hyperparameters['a_list_param']
+
@patch('mlblocks.mlblock.import_object')
@patch('mlblocks.mlblock.load_primitive')
- def test_get_hyperparameters(self, load_primitive_mock, import_object_mock):
- """get_hyperparameters has to return a copy of the _hyperparameters attribute."""
- load_primitive_mock.return_value = {
- 'primitive': 'a_primitive_name',
+ def test_modify_hyperparameters(self, lp_mock, io_mock):
+ """If a primitive method modifies the hyperparameters, changes should not persist."""
+
+ def primitive(a_list_param):
+ a_list_param.append('b')
+
+ io_mock.return_value = primitive
+
+ lp_mock.return_value = {
+ 'primitive': 'a_primitive',
'produce': {
'args': [],
'output': []
}
}
- mlblock = MLBlock('given_primitive_name')
+ mlblock = MLBlock('a_primitive')
- hyperparameters = dict()
+ hyperparameters = {
+ 'a_list_param': ['a']
+ }
mlblock._hyperparameters = hyperparameters
- returned = mlblock.get_hyperparameters()
+ mlblock.produce()
- assert returned == hyperparameters
- assert returned is not hyperparameters
+ assert 'b' not in hyperparameters['a_list_param']
def test_set_hyperparameters_function(self):
pass
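
A short sketch of the isolation guarantee these changes establish, mirroring the new tests; `block` stands for any `MLBlock` whose hyperparameters include a hypothetical list value:

```python
# get_hyperparameters now returns a deepcopy, so mutating the returned
# dict no longer leaks into the block.
hyperparameters = block.get_hyperparameters()
hyperparameters['a_list_param'].append('b')

assert 'b' not in block.get_hyperparameters()['a_list_param']
```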
From 735f48d02f2d73f019d9623fcfcc0920abfb6904 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 17 Jul 2019 15:07:39 +0200
Subject: [PATCH 041/160] Add fit and produce default arg values
---
mlblocks/mlpipeline.py | 4 ++++
tests/test_mlpipeline.py | 37 ++++++++++++++++++++++++++++++++++++-
2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 36b71b29..e19a68ee 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -330,6 +330,10 @@ def _get_block_args(self, block_name, block_args, context):
if variable in context:
kwargs[name] = context[variable]
+ elif 'default' in arg:
+ kwargs[name] = arg['default']
+ elif arg.get('required', True):
+ raise ValueError('Input variable {} not found in context'.format(variable))
return kwargs
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 906c2c61..2011f5ae 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -238,7 +238,42 @@ def test_set_hyperparameters_flat(self):
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
def test__get_block_args(self):
- pass
+ input_names = {
+ 'a_block': {
+ 'arg_3': 'arg_3_alt'
+ }
+ }
+ pipeline = MLPipeline(list(), input_names=input_names)
+
+ block_args = [
+ {
+ 'name': 'arg_1',
+ },
+ {
+ 'name': 'arg_2',
+ 'default': 'arg_2_value'
+ },
+ {
+ 'name': 'arg_3',
+ },
+ {
+ 'name': 'arg_4',
+ 'required': False
+ },
+ ]
+ context = {
+ 'arg_1': 'arg_1_value',
+ 'arg_3_alt': 'arg_3_value'
+ }
+
+ args = pipeline._get_block_args('a_block', block_args, context)
+
+ expected = {
+ 'arg_1': 'arg_1_value',
+ 'arg_2': 'arg_2_value',
+ 'arg_3': 'arg_3_value',
+ }
+ assert args == expected
def test__get_outputs(self):
pass
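
In annotation terms, the new behavior of `_get_block_args` can be summarized as follows (argument names taken from the test above):

```python
block_args = [
    {'name': 'arg_1'},                      # required: taken from the context
    {'name': 'arg_2', 'default': 'value'},  # missing: falls back to its default
    {'name': 'arg_4', 'required': False},   # missing and optional: skipped
]
# A required argument that is neither in the context nor has a default
# now raises ValueError('Input variable arg_1 not found in context').
```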
From 2662fea39476dfc30914a9ded59caecdfe51ad0c Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 26 Jul 2019 19:03:12 +0200
Subject: [PATCH 042/160] Fix dependencies
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index 98350606..0d9f766b 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@
'ipython>=6.5.0',
'matplotlib>=2.2.3',
'autodocsumm>=0.1.10',
+ 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15
# style check
'flake8>=3.5.0',
From ae6ab0983b10598a214fe9af2eb25e18a7442a5e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 26 Jul 2019 23:22:25 +0200
Subject: [PATCH 043/160] Fix testing dependencies
---
setup.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 0d9f766b..608e481d 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,8 @@
'pytest-cov>=2.6.0',
'mlprimitives>=0.1.3,<0.2',
'urllib3>=1.20,<1.25',
- 'setuptools>=41.0.0'
+ 'setuptools>=41.0.0',
+ 'numpy<1.17',
]
From cd005af297f72cc5b6cb6b29228de14de992b920 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 6 Aug 2019 14:03:10 +0200
Subject: [PATCH 044/160] Flexible filter searching
---
mlblocks/discovery.py | 36 +++++++++++++++++++++++++-----------
1 file changed, 25 insertions(+), 11 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 40853de9..6d85f970 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -312,7 +312,23 @@ def _search_annotations(base_path, pattern, parts=None):
return annotations
-def _get_annotations_list(paths, loader, pattern, **metadata_filters):
+def _match_filter(annotation, key, value):
+ if '.' in key:
+ name, key = key.split('.', 1)
+ part = annotation.get(name) or dict()
+ return _match_filter(part, key, value)
+
+ annotation_value = annotation.get(key)
+ if not isinstance(annotation_value, type(value)):
+ if isinstance(annotation_value, (list, dict)):
+ return value in annotation_value
+ elif isinstance(value, (list, dict)):
+ return annotation_value in value
+
+ return annotation_value == value
+
+
+def _get_annotations_list(paths, loader, pattern, filters):
pattern = re.compile(pattern)
annotations = dict()
for base_path in paths:
@@ -321,10 +337,8 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters):
matching = list()
for name in sorted(annotations.values()):
annotation = loader(name)
- metadata = annotation.get('metadata', dict())
- for key, value in metadata_filters.items():
- metadata_value = metadata.get(key, '')
- if not re.search(value, metadata_value):
+ for key, value in filters.items():
+ if not _match_filter(annotation, key, value):
break
else:
@@ -333,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters):
return matching
-def get_primitives_list(pattern='', **metadata_filters):
- return _get_annotations_list(
- get_primitives_paths(), load_primitive, pattern, **metadata_filters)
+def get_primitives_list(pattern='', filters=None):
+ filters = filters or dict()
+ return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters)
-def get_pipelines_list(pattern='', **metadata_filters):
- return _get_annotations_list(
- get_pipelines_paths(), load_pipeline, pattern, **metadata_filters)
+def get_pipelines_list(pattern='', filters=None):
+ filters = filters or dict()
+ return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters)
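
A hedged sketch of the new matching semantics; the annotation contents are hypothetical:

```python
annotation = {
    'name': 'a.primitive',
    'classifiers': {'type': 'estimator', 'subtype': 'classifier'},
    'tags': ['text', 'image'],
}

_match_filter(annotation, 'classifiers.type', 'estimator')     # True: dotted key
_match_filter(annotation, 'tags', 'text')                      # True: containment
_match_filter(annotation, 'classifiers.subtype', 'regressor')  # False
```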
From 82ef5b53bd5ccd54c8971ae64479bc79d64f35ba Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 6 Aug 2019 21:04:13 +0200
Subject: [PATCH 045/160] Rename find_primitives and add tests
---
mlblocks/discovery.py | 10 +-
tests/test_discovery.py | 201 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 203 insertions(+), 8 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 6d85f970..db7ba40d 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -312,11 +312,11 @@ def _search_annotations(base_path, pattern, parts=None):
return annotations
-def _match_filter(annotation, key, value):
+def _match(annotation, key, value):
if '.' in key:
name, key = key.split('.', 1)
part = annotation.get(name) or dict()
- return _match_filter(part, key, value)
+ return _match(part, key, value)
annotation_value = annotation.get(key)
if not isinstance(annotation_value, type(value)):
@@ -338,7 +338,7 @@ def _get_annotations_list(paths, loader, pattern, filters):
for name in sorted(annotations.values()):
annotation = loader(name)
for key, value in filters.items():
- if not _match_filter(annotation, key, value):
+ if not _match(annotation, key, value):
break
else:
@@ -347,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, filters):
return matching
-def get_primitives_list(pattern='', filters=None):
+def find_primitives(pattern='', filters=None):
filters = filters or dict()
return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters)
-def get_pipelines_list(pattern='', filters=None):
+def find_pipelines(pattern='', filters=None):
filters = filters or dict()
return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 3681611b..07fc0753 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -2,9 +2,10 @@
import json
import os
+import re
import tempfile
import uuid
-from unittest.mock import call, patch
+from unittest.mock import Mock, call, patch
import pytest
from pkg_resources import Distribution, EntryPoint
@@ -12,6 +13,10 @@
from mlblocks import discovery
FAKE_PRIMITIVES_PATH = 'this/is/a/fake'
+FAKE_PRIMITIVES_PATHS = [
+ 'this/is/another/fake',
+ 'this/is/yet/another/fake',
+]
def test__add_lookup_path_do_nothing():
@@ -81,9 +86,16 @@ def test__load_entry_points_entry_points(iep_mock):
attrs=['FAKE_PRIMITIVES_PATH'],
dist=Distribution()
)
+ another_primitives_ep = EntryPoint(
+ 'primitives',
+ 'tests.test_discovery',
+ attrs=['FAKE_PRIMITIVES_PATHS'],
+ dist=Distribution()
+ )
iep_mock.return_value = [
something_else_ep,
- primitives_ep
+ primitives_ep,
+ another_primitives_ep
]
# run
@@ -91,7 +103,9 @@ def test__load_entry_points_entry_points(iep_mock):
# assert
expected = [
- 'this/is/a/fake'
+ 'this/is/a/fake',
+ 'this/is/another/fake',
+ 'this/is/yet/another/fake',
]
assert paths == expected
@@ -196,3 +210,184 @@ def test__load_pipeline_success(load_mock, gpp_mock):
load_mock.assert_called_once_with('valid.pipeline', ['a', 'b'])
assert pipeline == load_mock.return_value
+
+
+@patch('mlblocks.discovery.os')
+def test__search_annotations(os_mock):
+ os_mock.path.abspath = os.path.abspath
+ os_mock.path.join = os.path.join
+ os_mock.path.exists.return_value = True
+ os_mock.listdir.side_effect = [
+ [
+ 'a.primitive.json',
+ 'another.primitive.json',
+ 'some',
+ ],
+ [
+ 'other',
+ ],
+ [
+ 'primitive.json'
+ ]
+ ]
+ os_mock.path.isdir.return_value = False
+ os_mock.path.isdir.side_effect = [
+ False,
+ False,
+ True,
+ True,
+ False
+ ]
+
+ pattern = re.compile('other')
+ annotations = discovery._search_annotations('/path/to', pattern)
+
+ assert annotations == {
+ '/path/to/another.primitive.json': 'another.primitive',
+ '/path/to/some/other/primitive.json': 'some.other.primitive'
+ }
+
+
+def test__match_no_match():
+ annotation = {
+ 'name': 'a.primitive',
+ }
+
+ matches = discovery._match(annotation, 'key', 'value')
+
+ assert not matches
+
+
+def test__match_root():
+ annotation = {
+ 'name': 'a.primitive',
+ 'key': 'value'
+ }
+
+ matches = discovery._match(annotation, 'key', 'value')
+
+ assert matches
+
+
+def test__match_sublevel():
+ annotation = {
+ 'name': 'a.primitive',
+ 'some': {
+ 'sublevel': {
+ 'key': 'value'
+ }
+ }
+ }
+
+ matches = discovery._match(annotation, 'some.sublevel.key', 'value')
+
+ assert matches
+
+
+def test__match_list_no_match():
+ annotation = {
+ 'name': 'a.primitive',
+ 'key': [
+ 'another_value',
+ 'yet_another_value'
+ ]
+ }
+
+ matches = discovery._match(annotation, 'key', 'value')
+
+ assert not matches
+
+
+def test__match_list():
+ annotation = {
+ 'name': 'a.primitive',
+ 'key': [
+ 'value',
+ 'another_value'
+ ]
+ }
+
+ matches = discovery._match(annotation, 'key', 'value')
+
+ assert matches
+
+
+def test__match_dict():
+ annotation = {
+ 'name': 'a.primitive',
+ 'key': {
+ 'value': 'subvalue',
+ 'another_value': 'another_subvalue'
+ }
+ }
+
+ matches = discovery._match(annotation, 'key', 'value')
+
+ assert matches
+
+
+def test__match_multiple_keys():
+ annotation = {
+ 'name': 'a.primitive',
+ 'key': 'value'
+ }
+
+ matches = discovery._match(annotation, 'key', ['value', 'another_value'])
+
+ assert matches
+
+
+@patch('mlblocks.discovery._search_annotations')
+def test__get_annotations_list(search_annotations_mock):
+ search_annotations_mock.return_value = {
+ '/path/to/a/classifier.primitive.json': 'classifier.primitive',
+ '/path/to/a/regressor.primitive.json': 'regressor.primitive',
+ }
+
+ loader = Mock()
+ loader.side_effect = [
+ {
+ 'name': 'classifier.primitive',
+ 'classifiers': {
+ 'type': 'estimator',
+ 'subtype': 'classifier',
+ }
+ },
+ {
+ 'name': 'regressor.primitive',
+ 'classifiers': {
+ 'type': 'estimator',
+ 'subtype': 'regressor',
+ }
+ }
+ ]
+
+ filters = {
+ 'classifiers.subtype': 'regressor'
+ }
+ annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters)
+
+ assert annotations == ['regressor.primitive']
+ search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern'))
+
+
+@patch('mlblocks.discovery._get_annotations_list')
+@patch('mlblocks.discovery.get_primitives_paths')
+def test_find_primitives(gpp_mock, gal_mock):
+ primitives = discovery.find_primitives('pattern')
+
+ gal_mock.assert_called_once_with(
+ gpp_mock.return_value, discovery.load_primitive, 'pattern', dict())
+
+ assert primitives == gal_mock.return_value
+
+
+@patch('mlblocks.discovery._get_annotations_list')
+@patch('mlblocks.discovery.get_pipelines_paths')
+def test_find_primitives(gpp_mock, gal_mock):
+ primitives = discovery.find_pipelines('pattern', {'a': 'filter'})
+
+ gal_mock.assert_called_once_with(
+ gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'})
+
+ assert primitives == gal_mock.return_value
From 1ca63500c1c86fc973005ad2d3c2a768f685f13a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 6 Aug 2019 21:08:44 +0200
Subject: [PATCH 046/160] rename method
---
tests/test_discovery.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 07fc0753..bf148571 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -384,7 +384,7 @@ def test_find_primitives(gpp_mock, gal_mock):
@patch('mlblocks.discovery._get_annotations_list')
@patch('mlblocks.discovery.get_pipelines_paths')
-def test_find_primitives(gpp_mock, gal_mock):
+def test_find_pipelines(gpp_mock, gal_mock):
primitives = discovery.find_pipelines('pattern', {'a': 'filter'})
gal_mock.assert_called_once_with(
From ec4609f45929defff7e64a09c81a866810774d4f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 12:10:34 +0200
Subject: [PATCH 047/160] Add docstrings and rename a few methods
---
mlblocks/discovery.py | 151 ++++++++++++++++++++++++++++++++++++----
tests/test_discovery.py | 26 ++++---
2 files changed, 149 insertions(+), 28 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index db7ba40d..9a1dbef5 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -297,6 +297,29 @@ def load_pipeline(name):
def _search_annotations(base_path, pattern, parts=None):
+ """Search for annotations within the given path.
+
+ If the indicated path has subfolders, search recursively within them.
+
+ If a pattern is given, return only the annotations whose name
+ matches the pattern.
+
+ Args:
+ base_path (str):
+ path to the folder to be searched for annotations.
+ pattern (str):
+ Regular expression to search in the annotation names.
+ parts (list):
+ Optional. List containing the parent folders that are also part
+ of the annotation name. Used during recursion to be able to
+ build the final annotation name before returning it.
+
+ Returns:
+ dict:
+ dictionary containing paths as keys and annotation names as
+ values.
+ """
+ pattern = re.compile(pattern)
annotations = dict()
parts = parts or list()
if os.path.exists(base_path):
@@ -312,24 +335,70 @@ def _search_annotations(base_path, pattern, parts=None):
return annotations
-def _match(annotation, key, value):
- if '.' in key:
- name, key = key.split('.', 1)
- part = annotation.get(name) or dict()
- return _match(part, key, value)
+def _match(annotation, key, values):
+ """Check if the annotation has the key and it matches any of the values.
+
+ If the given key is not found but it contains dots, split by the dots
+ and consider each part a sublevel in the annotation.
+
+ If the key value within the annotation is a list or a dict, check
+ whether any of the given values is contained within it instead of
+ checking for equality.
+
+ Args:
+ annotation (dict):
+ Dictionary annotation.
+ key (str):
+ Key to search within the annotation. It can contain dots to
+ separate nested subdictionary levels within the annotation.
+ values (object or list):
+ Value or list of values to search for.
- annotation_value = annotation.get(key)
- if not isinstance(annotation_value, type(value)):
+ Returns:
+ bool:
+ whether there is a match or not.
+ """
+ if not isinstance(values, list):
+ values = [values]
+
+ if key not in annotation:
+ if '.' in key:
+ name, key = key.split('.', 1)
+ part = annotation.get(name) or dict()
+ return _match(part, key, values)
+ else:
+ return False
+
+ annotation_value = annotation[key]
+
+ for value in values:
if isinstance(annotation_value, (list, dict)):
return value in annotation_value
- elif isinstance(value, (list, dict)):
- return annotation_value in value
+ elif annotation_value == value:
+ return True
- return annotation_value == value
+ return False
-def _get_annotations_list(paths, loader, pattern, filters):
- pattern = re.compile(pattern)
+def _find_annotations(paths, loader, pattern, filters):
+ """Find matching annotations within the given paths.
+
+ Match annotations by both the name pattern and the filters.
+
+ Args:
+ paths (list):
+ List of paths to search annotations in.
+ loader (callable):
+ Function to use to load the annotation contents.
+ pattern (str):
+ Pattern to match against the annotation name.
+ filters (dict):
+ Dictionary containing key/value filters.
+
+ Returns:
+ list:
+ names of the matching annotations.
+ """
annotations = dict()
for base_path in paths:
annotations.update(_search_annotations(base_path, pattern))
@@ -348,10 +417,64 @@ def _get_annotations_list(paths, loader, pattern, filters):
def find_primitives(pattern='', filters=None):
+ """Find primitives by name and filters.
+
+ If a pattern is given, only the primitives whose name matches
+ the pattern will be returned.
+
+ If filters are given, they should be a dictionary containing key/value
+ filters that will have to be matched within the primitive annotation
+ for it to be included in the results.
+
+ If the given key is not found but it contains dots, split by the dots
+ and consider each part a sublevel in the annotation.
+
+ If the key value within the annotation is a list or a dict, check
+ whether any of the given values is contained within it instead of
+ checking for equality.
+
+ Args:
+ pattern (str):
+ Regular expression to match against the primitive names.
+ filters (dict):
+ Dictionary containing the filters to apply over the matching
+ primitives.
+
+ Returns:
+ list:
+ Names of the matching primitives.
+ """
filters = filters or dict()
- return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters)
+ return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters)
def find_pipelines(pattern='', filters=None):
+ """Find pipelines by name and filters.
+
+ If a pattern is given, only the pipelines whose name matches
+ the pattern will be returned.
+
+ If filters are given, they should be a dictionary containing key/value
+ filters that will have to be matched within the pipeline annotation
+ for it to be included in the results.
+
+ If the given key is not found but it contains dots, split by the dots
+ and consider each part a sublevel in the annotation.
+
+ If the key value within the annotation is a list or a dict, check
+ whether any of the given values is contained within it instead of
+ checking for equality.
+
+ Args:
+ pattern (str):
+ Regular expression to match against the pipeline names.
+ filters (dict):
+ Dictionary containing the filters to apply over the matching
+ pipelines.
+
+ Returns:
+ list:
+ Names of the matching pipelines.
+ """
filters = filters or dict()
- return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters)
+ return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index bf148571..dc3eca87 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -2,7 +2,6 @@
import json
import os
-import re
import tempfile
import uuid
from unittest.mock import Mock, call, patch
@@ -239,8 +238,7 @@ def test__search_annotations(os_mock):
False
]
- pattern = re.compile('other')
- annotations = discovery._search_annotations('/path/to', pattern)
+ annotations = discovery._search_annotations('/path/to', 'other')
assert annotations == {
'/path/to/another.primitive.json': 'another.primitive',
@@ -338,7 +336,7 @@ def test__match_multiple_keys():
@patch('mlblocks.discovery._search_annotations')
-def test__get_annotations_list(search_annotations_mock):
+def test__find_annotations(search_annotations_mock):
search_annotations_mock.return_value = {
'/path/to/a/classifier.primitive.json': 'classifier.primitive',
'/path/to/a/regressor.primitive.json': 'regressor.primitive',
@@ -365,29 +363,29 @@ def test__get_annotations_list(search_annotations_mock):
filters = {
'classifiers.subtype': 'regressor'
}
- annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters)
+ annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters)
assert annotations == ['regressor.primitive']
- search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern'))
+ search_annotations_mock.assert_called_once_with('/a/path', 'pattern')
-@patch('mlblocks.discovery._get_annotations_list')
+@patch('mlblocks.discovery._find_annotations')
@patch('mlblocks.discovery.get_primitives_paths')
-def test_find_primitives(gpp_mock, gal_mock):
+def test_find_primitives(gpp_mock, fa_mock):
primitives = discovery.find_primitives('pattern')
- gal_mock.assert_called_once_with(
+ fa_mock.assert_called_once_with(
gpp_mock.return_value, discovery.load_primitive, 'pattern', dict())
- assert primitives == gal_mock.return_value
+ assert primitives == fa_mock.return_value
-@patch('mlblocks.discovery._get_annotations_list')
+@patch('mlblocks.discovery._find_annotations')
@patch('mlblocks.discovery.get_pipelines_paths')
-def test_find_pipelines(gpp_mock, gal_mock):
+def test_find_pipelines(gpp_mock, fa_mock):
primitives = discovery.find_pipelines('pattern', {'a': 'filter'})
- gal_mock.assert_called_once_with(
+ fa_mock.assert_called_once_with(
gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'})
- assert primitives == gal_mock.return_value
+ assert primitives == fa_mock.return_value
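
With the docstrings in place, a hedged usage sketch of the public discovery API; the filter keys follow the test fixtures, and actual results depend on the annotations installed:

```python
from mlblocks import discovery

# All primitives whose name matches the regular expression.
discovery.find_primitives('sklearn')

# Narrow the search down with dotted filters over the annotation contents.
discovery.find_primitives(filters={'classifiers.subtype': 'regressor'})

# Pipelines are searched in exactly the same way.
discovery.find_pipelines(filters={'a': 'filter'})
```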
From 69a30cafcae4a776e8d1ed09c41116d1c82d2bee Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 12:36:51 +0200
Subject: [PATCH 048/160] Update README
---
README.md | 91 +++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 72 insertions(+), 19 deletions(-)
diff --git a/README.md b/README.md
index 5b4f2519..2d49d8a6 100644
--- a/README.md
+++ b/README.md
@@ -13,19 +13,14 @@
Pipelines and Primitives for Machine Learning and Data Science.
-[![PyPi][pypi-img]][pypi-url]
-[![Travis][travis-img]][travis-url]
-[![CodeCov][codecov-img]][codecov-url]
-
-[pypi-img]: https://img.shields.io/pypi/v/mlblocks.svg
-[pypi-url]: https://pypi.python.org/pypi/mlblocks
-[travis-img]: https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master
-[travis-url]: https://travis-ci.org/HDI-Project/MLBlocks
-[codecov-img]: https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg
-[codecov-url]: https://codecov.io/gh/HDI-Project/MLBlocks
+[![PyPI Shield](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks)
+[![Travis CI Shield](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks)
+[![Coverage Status](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks)
+[![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks)
* Free software: MIT license
* Documentation: https://HDI-Project.github.io/MLBlocks
+- Homepage: https://github.com/HDI-Project/MLBlocks
# Overview
@@ -44,24 +39,82 @@ Features include:
outputs per primitive.
* Easy save and load Pipelines using JSON Annotations.
-# Installation
+# Install
+
+## Requirements
+
+**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/)
+
+Also, although it is not strictly required, the usage of a
+[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
+interfering with other software installed in the system where **MLBlocks** is run.
+
+These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**:
+
+```bash
+pip install virtualenv
+virtualenv -p $(which python3.6) mlblocks-venv
+```
-The simplest and recommended way to install MLBlocks is using `pip`:
+Afterwards, you have to execute this command to activate the virtualenv:
+
+```bash
+source mlblocks-venv/bin/activate
+```
+
+Remember to execute it every time you start a new console to work on **MLBlocks**!
+
+## Install with pip
+
+After creating the virtualenv and activating it, we recommend using
+[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**:
```bash
pip install mlblocks
```
-Alternatively, you can also clone the repository and install it from sources
+This will pull and install the latest stable release from [PyPi](https://pypi.org/).
+
+## Install from source
+
+Alternatively, with your virtualenv activated, you can clone the repository and install it from
+source by running `make install` on the `stable` branch:
```bash
git clone git@github.com:HDI-Project/MLBlocks.git
cd MLBlocks
+git checkout stable
make install
```
-For development, you can use `make install-develop` instead in order to install all
-the required dependencies for testing and code linting.
+## Install for Development
+
+If you want to contribute to the project, a few more steps are required to make the project ready
+for development.
+
+First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks)
+and make a fork of the project under your own username by clicking on the **fork** button on the
+upper right corner of the page.
+
+Afterwards, clone your fork and create a branch from master with a descriptive name that includes
+the number of the issue that you are going to work on:
+
+```bash
+git clone git@github.com:{your username}/MLBlocks.git
+cd MLBlocks
+git branch issue-xx-cool-new-feature master
+git checkout issue-xx-cool-new-feature
+```
+
+Finally, install the project with the following command, which will install some additional
+dependencies for code linting and testing.
+
+```bash
+make install-develop
+```
+
+While developing, make sure to run `make lint` and `make test` regularly.
+
## MLPrimitives
@@ -75,12 +128,12 @@ with this command:
pip install mlprimitives
```
-# Usage Example
+# Quickstart
Below is a short example showing how to use MLBlocks to create a simple pipeline, fit it
using demo data and use it to make predictions.
-Please make sure to having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
+Please make sure to have also installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
before following it.
For advanced usage and a more detailed explanation about each component, please have a look
@@ -153,7 +206,7 @@ its `get_hyperparameters` method:
}
```
-### Making predictions
+## Making predictions
Once we have created the pipeline with the desired hyperparameters we can fit it
and then use it to make predictions on new data.
@@ -180,7 +233,7 @@ to obtain predictions from the pipeline.
array([3, 2, 1, ..., 1, 1, 2])
```
-## What's Next?
+# What's Next?
If you want to learn more about how to tune the pipeline hyperparameters, save and load
the pipelines using JSON annotations or build complex multi-branched pipelines, please
From 6324d3ffaad0fc45ad00fabc9c43de0b6e92ebf0 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 12:38:25 +0200
Subject: [PATCH 049/160] Update README title
---
README.md | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 2d49d8a6..19f740ed 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,6 @@
An open source project from Data to AI Lab at MIT.
-
-
@@ -22,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
* Documentation: https://HDI-Project.github.io/MLBlocks
- Homepage: https://github.com/HDI-Project/MLBlocks
-# Overview
+# MLBlocks
MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by
seamlessly combining tools from any python library with a simple, common and uniform interface.
From c2771588f0d65e7ad3fdde9c71d3979ecd3dca3a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 13:05:45 +0200
Subject: [PATCH 050/160] Update dependencies
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 608e481d..4c371761 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.1.3,<0.2',
+ 'mlprimitives>=0.2,<0.3',
'urllib3>=1.20,<1.25',
'setuptools>=41.0.0',
'numpy<1.17',
From b65d7c77fd0b4275fb287e77867a8a43471ee3b3 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 13:34:23 +0200
Subject: [PATCH 051/160] Fix docs quickstart
---
Makefile | 2 +-
docs/getting_started/quickstart.rst | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index 6266033f..e54e1362 100644
--- a/Makefile
+++ b/Makefile
@@ -112,7 +112,7 @@ test: ## run tests quickly with the default Python
.PHONY: test-all
test-all: ## run tests on every Python version with tox
- tox
+ tox -r
.PHONY: coverage
coverage: ## check code coverage quickly with the default Python
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index 2887da05..31be89ee 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -24,6 +24,7 @@ them to the `MLPipeline class`_:
from mlblocks import MLPipeline
primitives = [
+ 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
'mlprimitives.custom.feature_extraction.StringVectorizer',
'sklearn.ensemble.RandomForestClassifier',
]
From c189a7f267613e71ba8632c1b5ca80bf1be79043 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 7 Aug 2019 14:20:24 +0200
Subject: [PATCH 052/160] Add metadata attribute
---
mlblocks/mlblock.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index c32f978a..5727384e 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -146,22 +146,22 @@ def _get_tunable(cls, hyperparameters, init_params):
def __init__(self, name, **kwargs):
self.name = name
- primitive = load_primitive(name)
+ self.metadata = load_primitive(name)
- self.primitive = import_object(primitive['primitive'])
+ self.primitive = import_object(self.metadata['primitive'])
- self._fit = primitive.get('fit', dict())
+ self._fit = self.metadata.get('fit', dict())
self.fit_args = self._fit.get('args', [])
self.fit_method = self._fit.get('method')
- self._produce = primitive['produce']
+ self._produce = self.metadata['produce']
self.produce_args = self._produce['args']
self.produce_output = self._produce['output']
self.produce_method = self._produce.get('method')
self._class = bool(self.produce_method)
- hyperparameters = primitive.get('hyperparameters', dict())
+ hyperparameters = self.metadata.get('hyperparameters', dict())
init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters)
self._hyperparameters = init_params
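
A brief sketch of what the new attribute exposes; the primitive name is hypothetical:

```python
from mlblocks import MLBlock

block = MLBlock('a.primitive.Name')

# The full JSON annotation is now available on the block.
block.metadata['primitive']                # FQN of the wrapped object
block.metadata.get('hyperparameters', {})  # raw hyperparameter specification
```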
From c78c1373f03aa82ef73cbfcffa2d48f051eb4cbf Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 8 Aug 2019 15:42:36 +0200
Subject: [PATCH 053/160] Allow passing fit and produce args as init params
---
mlblocks/mlblock.py | 25 ++++++++-------
mlblocks/mlpipeline.py | 37 +++++++++++-----------
tests/features/test_fit_predicr_args.py | 42 +++++++++++++++++++++++++
tests/test_mlblock.py | 25 ++++++++++++---
tests/test_mlpipeline.py | 1 -
5 files changed, 96 insertions(+), 34 deletions(-)
create mode 100644 tests/features/test_fit_predicr_args.py
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index 5727384e..db24caa5 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -13,8 +13,11 @@
def import_object(object_name):
"""Import an object from its Fully Qualified Name."""
- package, name = object_name.rsplit('.', 1)
- return getattr(importlib.import_module(package), name)
+ if isinstance(object_name, str):
+ package, name = object_name.rsplit('.', 1)
+ return getattr(importlib.import_module(package), name)
+
+ return object_name
class MLBlock():
@@ -27,7 +30,7 @@ class MLBlock():
Attributes:
name (str):
- Name given to this MLBlock.
+ Primitive name.
metadata (dict):
Additional information about this primitive
primitive (object):
@@ -46,8 +49,8 @@ class MLBlock():
function.
Args:
- name (str):
- Name given to this MLBlock.
+ primitive (str or dict):
+ primitive name or primitive dictionary.
**kwargs:
Any additional arguments that will be used as hyperparameters or passed to the
``fit`` or ``produce`` methods.
@@ -143,10 +146,12 @@ def _get_tunable(cls, hyperparameters, init_params):
return tunable
- def __init__(self, name, **kwargs):
- self.name = name
+ def __init__(self, primitive, **kwargs):
+ if isinstance(primitive, str):
+ primitive = load_primitive(primitive)
- self.metadata = load_primitive(name)
+ self.metadata = primitive
+ self.name = primitive['name']
self.primitive = import_object(self.metadata['primitive'])
@@ -252,11 +257,9 @@ def _get_method_kwargs(self, kwargs, method_args):
if name in kwargs:
value = kwargs[name]
-
elif 'default' in arg:
value = arg['default']
-
- else:
+ elif arg.get('required', True):
raise TypeError("missing expected argument '{}'".format(name))
method_kwargs[keyword] = value
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index e19a68ee..14e5ce67 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -87,16 +87,21 @@ def _build_blocks(self):
block_names_count = Counter()
for primitive in self.primitives:
+ if isinstance(primitive, str):
+ primitive_name = primitive
+ else:
+ primitive_name = primitive['name']
+
try:
- block_names_count.update([primitive])
- block_count = block_names_count[primitive]
- block_name = '{}#{}'.format(primitive, block_count)
+ block_names_count.update([primitive_name])
+ block_count = block_names_count[primitive_name]
+ block_name = '{}#{}'.format(primitive_name, block_count)
block_params = self.init_params.get(block_name, dict())
if not block_params:
- block_params = self.init_params.get(primitive, dict())
+ block_params = self.init_params.get(primitive_name, dict())
if block_params and block_count > 1:
LOGGER.warning(("Non-numbered init_params are being used "
- "for more than one block %s."), primitive)
+ "for more than one block %s."), primitive_name)
block = MLBlock(primitive, **block_params)
blocks[block_name] = block
@@ -330,10 +335,6 @@ def _get_block_args(self, block_name, block_args, context):
if variable in context:
kwargs[name] = context[variable]
- elif 'default' in arg:
- kwargs[name] = arg['default']
- elif arg.get('required', True):
- raise ValueError('Input variable {} not found in context'.format(variable))
return kwargs
@@ -517,11 +518,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
the value of that variable from the context will extracted and returned
after the produce method of that block has been called.
"""
- context = {
- 'X': X,
- 'y': y
- }
- context.update(kwargs)
+ context = kwargs.copy()
+ if X is not None:
+ context['X'] = X
+
+ if y is not None:
+ context['y'] = y
output_block, output_variable = self._get_output_spec(output_)
last_block_name = self._get_block_name(-1)
@@ -624,10 +626,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
the value of that variable from the context will extracted and returned
after the produce method of that block has been called.
"""
- context = {
- 'X': X
- }
- context.update(kwargs)
+ context = kwargs.copy()
+ if X is not None:
+ context['X'] = X
output_block, output_variable = self._get_output_spec(output_)
diff --git a/tests/features/test_fit_predict_args.py b/tests/features/test_fit_predict_args.py
new file mode 100644
index 00000000..af4c0aea
--- /dev/null
+++ b/tests/features/test_fit_predict_args.py
@@ -0,0 +1,42 @@
+from mlblocks.mlpipeline import MLPipeline
+
+
+def test_fit_predict_args_in_init():
+
+ def add(a, b):
+ return a + b
+
+ primitive = {
+ 'name': 'add',
+ 'primitive': add,
+ 'produce': {
+ 'args': [
+ {
+ 'name': 'a',
+ 'type': 'float',
+ },
+ {
+ 'name': 'b',
+ 'type': 'float',
+ },
+ ],
+ 'output': [
+ {
+ 'type': 'float',
+ 'name': 'out'
+ }
+ ]
+ }
+ }
+
+ primitives = [primitive]
+ init_params = {
+ 'add': {
+ 'b': 10
+ }
+ }
+ pipeline = MLPipeline(primitives, init_params=init_params)
+
+ out = pipeline.predict(a=3)
+
+ assert out == 13
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 16f1c6d1..b4dbc637 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -323,6 +323,7 @@ def test__get_tunable_condition_match_null(self):
@patch('mlblocks.mlblock.load_primitive')
def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock):
load_primitive_mock.return_value = {
+ 'name': 'a_primitive_name',
'primitive': 'a_primitive_name',
'produce': {
'args': [
@@ -335,9 +336,22 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock):
}
}
- mlblock = MLBlock('given_primitive_name', argument='value')
+ mlblock = MLBlock('a_primitive_name', argument='value')
- assert mlblock.name == 'given_primitive_name'
+ assert mlblock.metadata == {
+ 'name': 'a_primitive_name',
+ 'primitive': 'a_primitive_name',
+ 'produce': {
+ 'args': [
+ {
+ 'name': 'argument'
+ }
+ ],
+ 'output': [
+ ]
+ }
+ }
+ assert mlblock.name == 'a_primitive_name'
assert mlblock.primitive == import_object_mock.return_value
assert mlblock._fit == dict()
assert mlblock.fit_args == list()
@@ -370,6 +384,7 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock):
@patch('mlblocks.mlblock.load_primitive')
def test___str__(self, load_primitive_mock, import_object_mock):
load_primitive_mock.return_value = {
+ 'name': 'a_primitive_name',
'primitive': 'a_primitive_name',
'produce': {
'args': [],
@@ -377,15 +392,16 @@ def test___str__(self, load_primitive_mock, import_object_mock):
}
}
- mlblock = MLBlock('given_primitive_name')
+ mlblock = MLBlock('a_primitive_name')
- assert str(mlblock) == 'MLBlock - given_primitive_name'
+ assert str(mlblock) == 'MLBlock - a_primitive_name'
@patch('mlblocks.mlblock.import_object')
@patch('mlblocks.mlblock.load_primitive')
def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mock):
"""get_tunable_hyperparameters has to return a copy of the _tunables attribute."""
load_primitive_mock.return_value = {
+ 'name': 'a_primitive_name',
'primitive': 'a_primitive_name',
'produce': {
'args': [],
@@ -433,6 +449,7 @@ def primitive(a_list_param):
io_mock.return_value = primitive
lp_mock.return_value = {
+ 'name': 'a_primitive',
'primitive': 'a_primitive',
'produce': {
'args': [],
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 2011f5ae..327387f5 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -270,7 +270,6 @@ def test__get_block_args(self):
expected = {
'arg_1': 'arg_1_value',
- 'arg_2': 'arg_2_value',
'arg_3': 'arg_3_value',
}
assert args == expected
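Patch 053 turns ``import_object`` into a pass-through for anything that is not a
string, which is what allows a primitive dictionary to carry a live callable
instead of a Fully Qualified Name. A minimal sketch of the two accepted forms,
using stdlib names purely for illustration::

    from mlblocks.mlblock import import_object

    # A string is treated as a Fully Qualified Name and imported.
    counter_cls = import_object('collections.Counter')
    assert counter_cls.__name__ == 'Counter'

    # Anything else is assumed to already be the object and is returned as-is.
    assert import_object(len) is len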
From badd7f176e4d5df9b89d0e083224a0c33257c807 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:10:31 +0200
Subject: [PATCH 054/160] Add release notes for v0.3.2
---
HISTORY.md | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/HISTORY.md b/HISTORY.md
index e6b14674..f2654353 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,19 +1,23 @@
Changelog
=========
+0.3.2 - 2019-08-12
+------------------
+
+* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala
+* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala
+* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala
+* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala
+* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala
+
0.3.1 - Pipelines Discovery
---------------------------
-* Support flat hyperparameter dictionaries
- [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala
-* Load pipelines by name and register them as `entry_points`
- [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala
-* Implement partial re-fit
- [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala
-* Move argument parsing to MLBlock
- [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala
-* Allow getting intermediate outputs
- [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala
+* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala
+* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala
+* Implement partial re-fit - [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala
+* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala
+* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala
0.3.0 - New Primitives Discovery
--------------------------------
From a094e9f1f7543758a058c8dbf3cb443854cfcf4d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:11:23 +0200
Subject: [PATCH 055/160] =?UTF-8?q?Bump=20version:=200.3.2-dev=20=E2=86=92?=
=?UTF-8?q?=200.3.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b528aefe..9df5b210 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.2-dev'
+__version__ = '0.3.2'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 1967b27b..97bb08a0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.2-dev
+current_version = 0.3.2
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4c371761..3514f943 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.2-dev',
+ version='0.3.2',
zip_safe=False,
)
From 14446f71c60213de2c3206e4beae25c5fa0f5d0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:11:39 +0200
Subject: [PATCH 056/160] =?UTF-8?q?Bump=20version:=200.3.2=20=E2=86=92=200?=
=?UTF-8?q?.3.3-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 9df5b210..7f6e1eaf 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.2'
+__version__ = '0.3.3-dev'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 97bb08a0..a9051663 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.2
+current_version = 0.3.3-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 3514f943..870d1276 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.2',
+ version='0.3.3-dev',
zip_safe=False,
)
From 65610157d2cea9d42545587b36ef4628d10c2893 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:13:19 +0200
Subject: [PATCH 057/160] Typo in the release notes
---
HISTORY.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/HISTORY.md b/HISTORY.md
index f2654353..c3b00ce0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -8,7 +8,7 @@ Changelog
* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala
* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala
* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala
-* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala
+* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala
0.3.1 - Pipelines Discovery
---------------------------
From 8c03242cb648a68f997e3ee0b3b6557623bd3b35 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 3 Sep 2019 14:34:26 +0200
Subject: [PATCH 058/160] Advanced intermediate outputs
---
mlblocks/mlpipeline.py | 430 ++++++++++++++-----------
tests/features/test_partial_outputs.py | 15 +-
tests/test_mlpipeline.py | 43 ++-
3 files changed, 280 insertions(+), 208 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 14e5ce67..b02561fe 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -5,6 +5,7 @@
import json
import logging
from collections import Counter, OrderedDict, defaultdict
+from copy import deepcopy
import numpy as np
@@ -72,6 +73,11 @@ class MLPipeline():
given when stored in the context dictionary. This allows storing the output of
different primitives in different variables, even if the primitive output name is
the same one.
+ outputs (dict):
+ dictionary containing lists of output variables associated to a name.
+ verbose (bool):
+ whether to log the exceptions that occur when running the pipeline
+ before re-raising them.
"""
def _get_tunable_hyperparameters(self):
@@ -114,7 +120,6 @@ def _build_blocks(self):
@staticmethod
def _get_pipeline_dict(pipeline, primitives):
-
if isinstance(pipeline, dict):
return pipeline
@@ -136,18 +141,50 @@ def _get_pipeline_dict(pipeline, primitives):
return dict()
+ def _get_block_outputs(self, block_name):
+ """Get the list of output variables for the given block."""
+ block = self.blocks[block_name]
+ outputs = deepcopy(block.produce_output)
+ for output in outputs:
+ output['variable'] = '{}.{}'.format(block_name, output['name'])
+
+ return outputs
+
+ def _get_outputs(self, pipeline, outputs):
+ """Get the output definitions from the pipeline dictionary.
+
+ If the ``"default"`` entry does not exist, it is built using the
+ outputs from the last block in the pipeline.
+ """
+ outputs = outputs or pipeline.get('outputs')
+ if outputs is None:
+ outputs = dict()
+
+ if 'default' not in outputs:
+ outputs['default'] = self._get_block_outputs(self._last_block_name)
+
+ return outputs
+
+ def _get_block_name(self, index):
+ """Get the name of the block in the ``index`` position."""
+ return list(self.blocks.keys())[index]
+
def __init__(self, pipeline=None, primitives=None, init_params=None,
- input_names=None, output_names=None):
+ input_names=None, output_names=None, outputs=None, verbose=True):
pipeline = self._get_pipeline_dict(pipeline, primitives)
self.primitives = primitives or pipeline['primitives']
self.init_params = init_params or pipeline.get('init_params', dict())
self.blocks = self._build_blocks()
+ self._last_block_name = self._get_block_name(-1)
self.input_names = input_names or pipeline.get('input_names', dict())
self.output_names = output_names or pipeline.get('output_names', dict())
+ self.outputs = self._get_outputs(pipeline, outputs)
+ self.verbose = verbose
+
tunable = pipeline.get('tunable_hyperparameters')
if tunable is not None:
self._tunable_hyperparameters = tunable
@@ -158,6 +195,122 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
if hyperparameters:
self.set_hyperparameters(hyperparameters)
+ def _get_str_output(self, output):
+ """Get the outputs that correspond to the str specification."""
+ if output in self.outputs:
+ return self.outputs[output]
+ elif output in self.blocks:
+ return self._get_block_outputs(output)
+ elif '.' in output:
+ block_name, variable_name = output.rsplit('.', 1)
+ block = self.blocks.get(block_name)
+ if not block:
+ raise ValueError('Invalid block name: {}'.format(block_name))
+
+ for variable in block.produce_output:
+ if variable['name'] == variable_name:
+ return [{'name': variable_name, 'variable': output}]
+
+ raise ValueError('Block {} has no output {}'.format(block_name, variable_name))
+
+ raise ValueError('Invalid Output Specification: {}'.format(output))
+
+ def get_outputs(self, outputs='default'):
+ """Get the list of output variables that correspond to the specified outputs.
+
+ Outputs specification can either be a single string, a single integer, or a
+ list of strings and integers.
+
+ If strings are given, they can either be one of the named outputs that have
+ been specified on the pipeline definition or the name of a block, including the
+ counter number at the end, or a full variable specification following the format
+ ``{block-name}.{variable-name}``.
+
+ Alternatively, integers can be passed as indexes of the blocks from which to get
+ the outputs.
+
+ If output specifications that resolve to multiple output variables are given,
+ such as the named outputs or block names, all the variables are concatenated
+ together, in order, in a single variable list.
+
+ Args:
+ outputs (str, int or list[str or int]):
+ Single or list of output specifications.
+
+ Returns:
+ list:
+ List of dictionaries specifying all the output variables. Each
+ dictionary contains the entries ``name`` and ``variable``, as
+ well as any other metadata that may have been included in the
+ pipeline outputs or block produce outputs specification.
+
+ Raises:
+ ValueError:
+ If an output specification is not valid.
+ TypeError:
+ If the type of a specification is not a str or an int.
+ """
+ if not isinstance(outputs, (list, tuple)):
+ outputs = (outputs, )
+
+ computed = list()
+ for output in outputs:
+ if isinstance(output, str):
+ computed.extend(self._get_str_output(output))
+ elif isinstance(output, int):
+ block_name = self._get_block_name(output)
+ computed.extend(self._get_block_outputs(block_name))
+ else:
+ raise TypeError('Output Specification can only be str or int')
+
+ return computed
+
+ def get_output_names(self, outputs='default'):
+ """Get the names of the outputs that correspond to the given specification.
+
+ The indicated outputs will be resolved and the names of the output variables
+ will be returned as a single list.
+
+ Args:
+ outputs (str, int or list[str or int]):
+ Single or list of output specifications.
+
+ Returns:
+ list:
+ List of variable names
+
+ Raises:
+ ValueError:
+ If an output specification is not valid.
+ TypeError:
+ If the type of a specification is not a str or an int.
+ """
+ outputs = self.get_outputs(outputs)
+ return [output['name'] for output in outputs]
+
+ def get_output_variables(self, outputs='default'):
+ """Get the list of variable specifications of the given outputs.
+
+ The indicated outputs will be resolved and their variables specifications
+ will be returned as a single list.
+
+ Args:
+ outputs (str, int or list[str or int]):
+ Single or list of output specifications.
+
+ Returns:
+ list:
+ List of variable specifications.
+
+ Raises:
+ ValueError:
+ If an output specification is not valid.
+ TypeError:
+ If the type of a specification is not a str or an int.
+ """
+ outputs = self.get_outputs(outputs)
+ return [output['variable'] for output in outputs]
+
@staticmethod
def _flatten_dict(hyperparameters):
return {
@@ -361,96 +514,48 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
return output_dict
- def _get_block_name(self, index):
- """Get the name of the block in the ``index`` position."""
- return list(self.blocks.keys())[index]
-
- def _get_output_spec(self, output):
- """Parse the output specification and get a block name and a variable name.
-
- The output specification can be of two types: int and str.
-
- If it is an integer, it is interpreted as a block index, and the variable name
- is considered to be ``None``, which means that the whole context will be returned.
-
- If it is a string, it can be interpreted in three ways:
-
- * **block name**: If the string matches a block name exactly, including
- its hash and counter number ``#n`` at the end, the whole context will be
- returned after that block is produced.
- * **variable_name**: If the string does not match any block name and does
- not contain any dot characted, ``'.'``, it will be considered a variable
- name. In this case, the indicated variable will be extracted from the
- context and returned after the last block has been produced.
- * **block_name + variable_name**: If the complete string does not match a
- block name but it contains at least one dot, ``'.'``, it will be split
- in two parts on the last dot. If the first part of the string matches a
- block name exactly, the second part of the string will be considered a
- variable name, assuming the format ``{block_name}.{variable_name}``, and
- the indicated variable will be extracted from the context and returned
- after the block has been produced. Otherwise, if the extracted
- ``block_name`` does not match a block name exactly, a ``ValueError``
- will be raised.
+ def _update_outputs(self, block_name, output_variables, outputs, outputs_dict):
+ """Set the requested block outputs into the outputs list in the right place."""
+ for key, value in outputs_dict.items():
+ variable_name = '{}.{}'.format(block_name, key)
+ if variable_name in output_variables:
+ index = output_variables.index(variable_name)
+ outputs[index] = deepcopy(value)
+
+ def _fit_block(self, block, block_name, context):
+ """Get the block args from the context and fit the block."""
+ LOGGER.debug("Fitting block %s", block_name)
+ try:
+ fit_args = self._get_block_args(block_name, block.fit_args, context)
+ block.fit(**fit_args)
+ except Exception:
+ if self.verbose:
+ LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
- Args:
- output (str or int):
- Output specification as either a string or an integer.
+ raise
- Raises:
- ValueError:
- If the output string contains dots but it does not match a block
- name exactly
+ def _produce_block(self, block, block_name, context, output_variables, outputs):
+ """Get the block args from the context and produce the block.
- Returns:
- tuple:
- The output is a tuple containing:
- * block_name (str): name of the block from which the output will be
- returned, including its counter number.
- * variable_name (str): Name of the variable to extract from the context.
- It can be ``None``, which means that the whole context is to be
- returned.
+ Afterwards, set the block outputs back into the context and update
+ the outputs list if necessary.
"""
- # If None is given, both block and varialbe are None
- if output is None:
- return None, None
-
- # If an int is given, it is a block index and there is no variable
- if isinstance(output, int):
- output = self._get_block_name(output)
- return output, None
-
- # If the string matches a block name, there is no variable
- if output in self.blocks:
- return output, None
-
- # If there is at least one dot in the output, but it did not match
- # a block name, it is considered to be {block_name}.{variable_name}
- if '.' in output:
- output_block, output_variable = output.rsplit('.', 1)
- if output_block not in self.blocks:
- raise ValueError('Unknown block name: {}'.format(output_block))
-
- return output_block, output_variable
-
- # If the given string is not a block name and it has no dots,
- # it is considered to be a variable name to be extracted
- # from the context after the last block has been produced
- last_block_name = self._get_block_name(-1)
- return last_block_name, output
-
- def _get_output(self, output_variable, context):
- """Get the specified output variable from the context.
-
- If the variable name is ``None``, return the entire context.
- """
- if output_variable:
- if output_variable not in context:
- raise ValueError('Output variable {} not found in context'
- .format(output_variable))
+ LOGGER.debug("Producing block %s", block_name)
+ try:
+ produce_args = self._get_block_args(block_name, block.produce_args, context)
+ block_outputs = block.produce(**produce_args)
- return context[output_variable]
- else:
- return context
+ outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output)
+ context.update(outputs_dict)
+
+ if output_variables:
+ self._update_outputs(block_name, output_variables, outputs, outputs_dict)
+
+ except Exception:
+ if self.verbose:
+ LOGGER.exception("Exception caught producing MLBlock %s", block_name)
+
+ raise
def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
"""Fit the blocks of this pipeline.
@@ -467,35 +572,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
Args:
X:
Fit Data, which the pipeline will learn from.
-
y:
Fit Data labels, which the pipeline will use to learn how to
behave.
- output_ (str or int or None):
- Output specification, which can be a string or an integer or None.
-
- * If it is None (default), nothing will be returned
- * If an integer is given, it is interpreted as the block number, and the whole
- context after running the specified block will be returned.
- * If it is a string, it can be interpreted in three ways:
-
- * **block name**: If the string matches a block name exactly, including
- its hash and counter number ``#n`` at the end, the whole context will be
- returned after that block is produced.
- * **variable_name**: If the string does not match any block name and does
- not contain any dot characted, ``'.'``, it will be considered a variable
- name. In this case, the indicated variable will be extracted from the
- context and returned after the last block has been produced.
- * **block_name + variable_name**: If the complete string does not match a
- block name but it contains at least one dot, ``'.'``, it will be split
- in two parts on the last dot. If the first part of the string matches a
- block name exactly, the second part of the string will be considered a
- variable name, assuming the format ``{block_name}.{variable_name}``, and
- the indicated variable will be extracted from the context and returned
- after the block has been produced. Otherwise, if the extracted
- ``block_name`` does not match a block name exactly, a ``ValueError``
- will be raised.
+ output_ (str or int or list or None):
+ Output specification, as required by ``get_outputs``. If ``None`` is given,
+ nothing will be returned.
start_ (str or int or None):
Block index or block name to start processing from. The
@@ -510,13 +593,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
Returns:
None or dict or object:
- * If no output is specified, nothing will be returned.
- * If an output block has been specified without and output variable, the
- context dictionary will be returned after the produce method of that block
- has been called.
- * If both an output block and an output variable have been specified,
- the value of that variable from the context will extracted and returned
- after the produce method of that block has been called.
+ * If no ``output_`` is specified, nothing will be returned.
+ * If ``output_`` has been specified, either a single value or a
+ tuple of values will be returned.
"""
context = kwargs.copy()
if X is not None:
@@ -525,8 +604,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
if y is not None:
context['y'] = y
- output_block, output_variable = self._get_output_spec(output_)
- last_block_name = self._get_block_name(-1)
+ if output_ is not None:
+ output_variables = self.get_output_variables(output_)
+ outputs = output_variables.copy()
+ output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables}
+ else:
+ output_variables = None
+ outputs = None
+ output_blocks = set()
if isinstance(start_, int):
start_ = self._get_block_name(start_)
@@ -539,34 +624,28 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
LOGGER.debug("Skipping block %s fit", block_name)
continue
- LOGGER.debug("Fitting block %s", block_name)
- try:
- fit_args = self._get_block_args(block_name, block.fit_args, context)
- block.fit(**fit_args)
- except Exception:
- LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
- raise
+ self._fit_block(block, block_name, context)
- if (block_name != last_block_name) or (block_name == output_block):
- LOGGER.debug("Producing block %s", block_name)
- try:
- produce_args = self._get_block_args(block_name, block.produce_args, context)
- outputs = block.produce(**produce_args)
+ if (block_name != self._last_block_name) or (block_name in output_blocks):
+ self._produce_block(block, block_name, context, output_variables, outputs)
- output_dict = self._extract_outputs(block_name, outputs, block.produce_output)
- context.update(output_dict)
- except Exception:
- LOGGER.exception("Exception caught producing MLBlock %s", block_name)
- raise
+ # We already captured the output from this block
+ if block_name in output_blocks:
+ output_blocks.remove(block_name)
- if block_name == output_block:
- return self._get_output(output_variable, context)
+ # If there was an output_ but there are no pending
+ # outputs, we are done.
+ if output_ is not None and not output_blocks:
+ if len(outputs) > 1:
+ return tuple(outputs)
+ else:
+ return outputs[0]
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
- def predict(self, X=None, output_=None, start_=None, **kwargs):
+ def predict(self, X=None, output_='default', start_=None, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the ``produce`` method of each block, capturing the
@@ -581,29 +660,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
X:
Data which the pipeline will use to make predictions.
- output_ (str or int or None):
- Output specification, which can be a string or an integer or None.
- * If it is None (default), the output of the last block will be returned.
- * If an integer is given, it is interpreted as the block number, and the whole
- context after running the specified block will be returned.
- * If it is a string, it can be interpreted in three ways:
-
- * **block name**: If the string matches a block name exactly, including
- its hash and counter number ``#n`` at the end, the whole context will be
- returned after that block is produced.
- * **variable_name**: If the string does not match any block name and does
- not contain any dot characted, ``'.'``, it will be considered a variable
- name. In this case, the indicated variable will be extracted from the
- context and returned after the last block has been produced.
- * **block_name + variable_name**: If the complete string does not match a
- block name but it contains at least one dot, ``'.'``, it will be split
- in two parts on the last dot. If the first part of the string matches a
- block name exactly, the second part of the string will be considered a
- variable name, assuming the format ``{block_name}.{variable_name}``, and
- the indicated variable will be extracted from the context and returned
- after the block has been produced. Otherwise, if the extracted
- ``block_name`` does not match a block name exactly, a ``ValueError``
- will be raised.
+ output_ (str or int or list or None):
+ Output specification, as required by ``get_outputs``. If not specified
+ the ``default`` output will be returned.
start_ (str or int or None):
Block index or block name to start processing from. The
@@ -617,20 +676,17 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
to the context dictionary and available for the blocks.
Returns:
- None or dict or object:
- * If no output is specified, the output of the last block will be returned.
- * If an output block has been specified without and output variable, the
- context dictionary will be returned after the produce method of that block
- has been called.
- * If both an output block and an output variable have been specified,
- the value of that variable from the context will extracted and returned
- after the produce method of that block has been called.
+ object or tuple:
+ * If a single output is requested, it is returned alone.
+ * If multiple outputs have been requested, a tuple is returned.
"""
context = kwargs.copy()
if X is not None:
context['X'] = X
- output_block, output_variable = self._get_output_spec(output_)
+ output_variables = self.get_output_variables(output_)
+ outputs = output_variables.copy()
+ output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables}
if isinstance(start_, int):
start_ = self._get_block_name(start_)
@@ -643,27 +699,24 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
LOGGER.debug("Skipping block %s produce", block_name)
continue
- LOGGER.debug("Producing block %s", block_name)
- try:
- produce_args = self._get_block_args(block_name, block.produce_args, context)
- outputs = block.produce(**produce_args)
- output_dict = self._extract_outputs(block_name, outputs, block.produce_output)
- context.update(output_dict)
+ self._produce_block(block, block_name, context, output_variables, outputs)
- if block_name == output_block:
- return self._get_output(output_variable, context)
+ # We already captured the output from this block
+ if block_name in output_blocks:
+ output_blocks.remove(block_name)
- except Exception:
- LOGGER.exception("Exception caught producing MLBlock %s", block_name)
- raise
+ # If there was an output_ but there are no pending
+ # outputs, we are done.
+ if not output_blocks:
+ if len(outputs) > 1:
+ return tuple(outputs)
+ else:
+ return outputs[0]
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
- if output_ is None:
- return outputs
-
def to_dict(self):
"""Return all the details of this MLPipeline in a dict.
@@ -710,7 +763,8 @@ def to_dict(self):
'input_names': self.input_names,
'output_names': self.output_names,
'hyperparameters': self.get_hyperparameters(),
- 'tunable_hyperparameters': self._tunable_hyperparameters
+ 'tunable_hyperparameters': self._tunable_hyperparameters,
+ 'outputs': self.outputs,
}
def save(self, path):
diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py
index ce28d457..7098dcd7 100644
--- a/tests/features/test_partial_outputs.py
+++ b/tests/features/test_partial_outputs.py
@@ -40,7 +40,7 @@ def test_fit_output(self):
invalid_int = 10
str_block = 'sklearn.preprocessing.StandardScaler#1'
invalid_block = 'InvalidBlockName'
- str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y'
+ str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X'
invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'
# Run
@@ -58,16 +58,9 @@ def test_fit_output(self):
[0.71269665, -0.645124, 0.39067021, 0.31740553],
[0.26726124, -0.10752067, 1.36734573, 1.55176035]
])
- y = np.array([1, 0, 0, 1, 2])
- context = {
- 'X': X,
- 'y': y
- }
- almost_equal(context, int_out)
- almost_equal(context, str_out)
-
- almost_equal(y, str_out_variable)
-
+ almost_equal(X, int_out)
+ almost_equal(X, str_out)
+ almost_equal(X, str_out_variable)
assert no_output is None
# Run asserting exceptions
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 327387f5..3f6121ea 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -2,7 +2,7 @@
from collections import OrderedDict
from unittest import TestCase
-from unittest.mock import Mock, call, patch
+from unittest.mock import MagicMock, Mock, call, patch
from mlblocks.mlpipeline import MLPipeline
@@ -12,7 +12,15 @@ class TestMLPipline(TestCase):
@patch('mlblocks.mlpipeline.LOGGER')
@patch('mlblocks.mlpipeline.MLBlock')
def test___init__(self, mlblock_mock, logger_mock):
- blocks = [Mock(), Mock(), Mock(), Mock()]
+ blocks = [Mock(), Mock(), Mock()]
+ last_block = Mock()
+ last_block.produce_output = [
+ {
+ 'name': 'y',
+ 'type': 'array'
+ }
+ ]
+ blocks.append(last_block)
mlblock_mock.side_effect = blocks
primitives = [
@@ -61,6 +69,16 @@ def test___init__(self, mlblock_mock, logger_mock):
'another.primitive.Name#1': blocks[2].get_tunable_hyperparameters.return_value,
'another.primitive.Name#2': blocks[3].get_tunable_hyperparameters.return_value
}
+ assert mlpipeline.outputs == {
+ 'default': [
+ {
+ 'name': 'y',
+ 'type': 'array',
+ 'variable': 'another.primitive.Name#2.y'
+ }
+ ]
+ }
+ assert mlpipeline.verbose
expected_calls = [
call('a.primitive.Name', an_argument='value'),
@@ -75,8 +93,9 @@ def test___init__(self, mlblock_mock, logger_mock):
'a.primitive.Name'
)
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_get_tunable_hyperparameters(self):
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
tunable = dict()
mlpipeline._tunable_hyperparameters = tunable
@@ -85,8 +104,9 @@ def test_get_tunable_hyperparameters(self):
assert returned == tunable
assert returned is not tunable
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_get_tunable_hyperparameters_flat(self):
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
tunable = {
'block_1': {
'hp_1': {
@@ -141,6 +161,7 @@ def test_get_tunable_hyperparameters_flat(self):
}
assert returned == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_get_hyperparameters(self):
block_1 = Mock()
block_1.get_hyperparameters.return_value = {
@@ -155,7 +176,7 @@ def test_get_hyperparameters(self):
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
))
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
mlpipeline.blocks = blocks
hyperparameters = mlpipeline.get_hyperparameters()
@@ -172,6 +193,7 @@ def test_get_hyperparameters(self):
block_1.get_hyperparameters.assert_called_once_with()
block_2.get_hyperparameters.assert_called_once_with()
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_get_hyperparameters_flat(self):
block_1 = Mock()
block_1.get_hyperparameters.return_value = {
@@ -186,7 +208,7 @@ def test_get_hyperparameters_flat(self):
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
))
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
mlpipeline.blocks = blocks
hyperparameters = mlpipeline.get_hyperparameters(flat=True)
@@ -199,6 +221,7 @@ def test_get_hyperparameters_flat(self):
block_1.get_hyperparameters.assert_called_once_with()
block_2.get_hyperparameters.assert_called_once_with()
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_set_hyperparameters(self):
block_1 = Mock()
block_2 = Mock()
@@ -206,7 +229,7 @@ def test_set_hyperparameters(self):
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
))
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
mlpipeline.blocks = blocks
hyperparameters = {
@@ -219,6 +242,7 @@ def test_set_hyperparameters(self):
block_1.set_hyperparameters.assert_not_called()
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_set_hyperparameters_flat(self):
block_1 = Mock()
block_2 = Mock()
@@ -226,7 +250,7 @@ def test_set_hyperparameters_flat(self):
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
))
- mlpipeline = MLPipeline(list())
+ mlpipeline = MLPipeline(['a_primitive'])
mlpipeline.blocks = blocks
hyperparameters = {
@@ -237,13 +261,14 @@ def test_set_hyperparameters_flat(self):
block_1.set_hyperparameters.assert_not_called()
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test__get_block_args(self):
input_names = {
'a_block': {
'arg_3': 'arg_3_alt'
}
}
- pipeline = MLPipeline(list(), input_names=input_names)
+ pipeline = MLPipeline(['a_primitive'], input_names=input_names)
block_args = [
{
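Patch 058 replaces the single block/variable ``output_`` pair with the richer
specifications resolved by ``get_outputs``: named outputs, block indexes, block
names and ``{block-name}.{variable-name}`` strings, alone or combined in a list.
A sketch of the new behavior, mirroring the partial-outputs tests and assuming
scikit-learn and the corresponding primitive annotations are installed::

    import numpy as np

    from mlblocks.mlpipeline import MLPipeline

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ])

    X = np.eye(5)
    y = np.array([0, 0, 0, 0, 1])

    # 'default' resolves to the output variables of the last block.
    print(pipeline.get_output_names('default'))

    # A list specification returns one value per resolved variable, as a tuple.
    y_pred, X_scaled = pipeline.fit(
        X, y, output_=['default', 'sklearn.preprocessing.StandardScaler#1.X'])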
From dabf1a13dfa05f73e70e9b4578b08bca7ace7edc Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 3 Sep 2019 22:16:10 +0200
Subject: [PATCH 059/160] Add unit tests
---
mlblocks/mlpipeline.py | 4 +-
tests/features/test_partial_outputs.py | 57 ++++++---
tests/test_mlpipeline.py | 164 ++++++++++++++++++++++++-
3 files changed, 201 insertions(+), 24 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b02561fe..9de286cb 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -209,7 +209,9 @@ def _get_str_output(self, output):
for variable in block.produce_output:
if variable['name'] == variable_name:
- return [{'name': variable_name, 'variable': output}]
+ output_variable = deepcopy(variable)
+ output_variable['variable'] = output
+ return [output_variable]
raise ValueError('Block {} has no output {}'.format(block_name, variable_name))
diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py
index 7098dcd7..d31d2dd8 100644
--- a/tests/features/test_partial_outputs.py
+++ b/tests/features/test_partial_outputs.py
@@ -3,7 +3,6 @@
import numpy as np
-from mlblocks.datasets import load_iris
from mlblocks.mlpipeline import MLPipeline
@@ -15,6 +14,7 @@ def almost_equal(obj1, obj2):
for key, value in obj1.items():
if key not in obj2:
raise AssertionError("{} not in {}".format(key, obj2))
+
almost_equal(value, obj2[key])
else:
@@ -23,9 +23,14 @@ def almost_equal(obj1, obj2):
class TestPartialOutputs(TestCase):
def setUp(self):
- dataset = load_iris()
-
- self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1)
+ self.X = np.array([
+ [1, 0, 0, 0, 0],
+ [0, 1, 0, 0, 0],
+ [0, 0, 1, 0, 0],
+ [0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 1],
+ ])
+ self.y = np.array([0, 0, 0, 0, 1])
def test_fit_output(self):
@@ -36,6 +41,8 @@ def test_fit_output(self):
]
pipeline = MLPipeline(primitives)
+ named = 'default'
+ list_ = ['default', 0]
int_block = 0
invalid_int = 10
str_block = 'sklearn.preprocessing.StandardScaler#1'
@@ -44,20 +51,30 @@ def test_fit_output(self):
invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'
# Run
- int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block)
- str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block)
- str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5],
+ named_out = pipeline.fit(self.X, self.y, output_=named)
+ list_out = pipeline.fit(self.X, self.y, output_=list_)
+ int_out = pipeline.fit(self.X, self.y, output_=int_block)
+ str_out = pipeline.fit(self.X, self.y, output_=str_block)
+ str_out_variable = pipeline.fit(self.X, self.y,
output_=str_block_variable)
- no_output = pipeline.fit(self.X_train, self.y_train)
+ no_output = pipeline.fit(self.X, self.y)
# Assert successful calls
X = np.array([
- [0.71269665, -1.45152899, 0.55344946, 0.31740553],
- [0.26726124, 1.23648766, -1.1557327, -1.0932857],
- [-1.95991577, 0.967686, -1.1557327, -1.0932857],
- [0.71269665, -0.645124, 0.39067021, 0.31740553],
- [0.26726124, -0.10752067, 1.36734573, 1.55176035]
+ [2., -0.5, -0.5, -0.5, -0.5],
+ [-0.5, 2., -0.5, -0.5, -0.5],
+ [-0.5, -0.5, 2., -0.5, -0.5],
+ [-0.5, -0.5, -0.5, 2., -0.5],
+ [-0.5, -0.5, -0.5, -0.5, 2.],
])
+ y = np.array([
+ 0, 0, 0, 0, 1
+ ])
+
+ almost_equal(named_out, y)
+ assert len(list_out) == 2
+ almost_equal(list_out[0], y)
+ almost_equal(list_out[1], X)
almost_equal(X, int_out)
almost_equal(X, str_out)
almost_equal(X, str_out_variable)
@@ -65,13 +82,13 @@ def test_fit_output(self):
# Run asserting exceptions
with self.assertRaises(IndexError):
- pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int)
+ pipeline.fit(self.X, self.y, output_=invalid_int)
with self.assertRaises(ValueError):
- pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block)
+ pipeline.fit(self.X, self.y, output_=invalid_block)
with self.assertRaises(ValueError):
- pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable)
+ pipeline.fit(self.X, self.y, output_=invalid_variable)
def test_fit_start(self):
# Setup variables
@@ -87,8 +104,8 @@ def test_fit_start(self):
# Run first block
context = {
- 'X': self.X_train,
- 'y': self.y_train
+ 'X': self.X,
+ 'y': self.y
}
int_start = 1
str_start = 'sklearn.linear_model.LogisticRegression#1'
@@ -106,7 +123,7 @@ def test_predict_start(self):
'sklearn.linear_model.LogisticRegression'
]
pipeline = MLPipeline(primitives)
- pipeline.fit(self.X_train, self.y_train)
+ pipeline.fit(self.X, self.y)
# Mock the first block
block_mock = Mock()
@@ -114,7 +131,7 @@ def test_predict_start(self):
# Run first block
context = {
- 'X': self.X_train,
+ 'X': self.X,
}
int_start = 1
str_start = 'sklearn.linear_model.LogisticRegression#1'
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 3f6121ea..7062e38e 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -107,7 +107,7 @@ def test_get_tunable_hyperparameters(self):
@patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
def test_get_tunable_hyperparameters_flat(self):
mlpipeline = MLPipeline(['a_primitive'])
- tunable = {
+ mlpipeline._tunable_hyperparameters = {
'block_1': {
'hp_1': {
'type': 'int',
@@ -133,7 +133,6 @@ def test_get_tunable_hyperparameters_flat(self):
}
}
}
- mlpipeline._tunable_hyperparameters = tunable
returned = mlpipeline.get_tunable_hyperparameters(flat=True)
@@ -299,9 +298,168 @@ def test__get_block_args(self):
}
assert args == expected
- def test__get_outputs(self):
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ def test__get_outputs_no_outputs(self):
+ self_ = Mock()
+ self_._last_block_name = 'last_block'
+ self_._get_block_outputs.return_value = ['some', 'outputs']
+
+ pipeline = dict()
+ outputs = None
+ returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+ expected = {
+ 'default': ['some', 'outputs']
+ }
+ assert returned == expected
+
+ self_._get_block_outputs.assert_called_once_with('last_block')
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ def test__get_outputs_defaults(self):
+ self_ = Mock()
+
+ pipeline = dict()
+ outputs = {
+ 'default': ['some', 'outputs']
+ }
+ returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+ expected = {
+ 'default': ['some', 'outputs']
+ }
+ assert returned == expected
+ self_._get_block_outputs.assert_not_called()
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ def test__get_outputs_additional(self):
+ self_ = Mock()
+
+ pipeline = {
+ 'outputs': {
+ 'default': ['some', 'outputs'],
+ 'additional': ['other', 'outputs']
+ }
+ }
+ outputs = None
+ returned = MLPipeline._get_outputs(self_, pipeline, outputs)
+
+ expected = {
+ 'default': ['some', 'outputs'],
+ 'additional': ['other', 'outputs']
+ }
+ assert returned == expected
+ self_._get_block_outputs.assert_not_called()
+
+ def test_get_outputs_str(self):
+ pass
+
+ def test_get_outputs_int(self):
+ pass
+
+ def test_get_outputs_list_of_str(self):
+ pass
+
+ def test_get_outputs_list_of_int(self):
pass
+ def test_get_outputs_named_outputs(self):
+ pass
+
+ def test_get_outputs_combination(self):
+ pass
+
+ @patch('mlblocks.mlpipeline.MLBlock')
+ def test_get_outputs_invalid(self, mlblock_mock):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_variable',
+ 'type': 'a_type',
+ }
+ ],
+ 'debug': [
+ {
+ 'name': 'another_name',
+ 'variable': 'another_variable',
+ }
+ ]
+ }
+ mlblock_mock.side_effect = [MagicMock(), MagicMock()]
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
+
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['another_primitive#1'].produce_output = [
+ {
+ 'name': 'something',
+ }
+ ]
+
+ returned = pipeline.get_outputs(['default', 'debug', -1, 'a_primitive#1.output'])
+
+ expected = [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_variable',
+ 'type': 'a_type'
+ },
+ {
+ 'name': 'another_name',
+ 'variable': 'another_variable',
+ },
+ {
+ 'name': 'something',
+ 'variable': 'another_primitive#1.something',
+ },
+ {
+ 'name': 'output',
+ 'type': 'whatever',
+ 'variable': 'a_primitive#1.output'
+ }
+ ]
+
+ assert returned == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ def test_get_output_names(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_variable',
+ 'type': 'a_type',
+ }
+ ]
+ }
+ pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+ names = pipeline.get_output_names()
+
+ assert names == ['a_name']
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ def test_get_output_variables(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_variable',
+ 'type': 'a_type',
+ }
+ ]
+ }
+ pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+ names = pipeline.get_output_variables()
+
+ assert names == ['a_variable']
+
def test_fit(self):
pass
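The ``deepcopy`` fix in ``_get_str_output`` means that a
``{block-name}.{variable-name}`` specification now carries over any extra
metadata declared in the block ``produce_output``, such as ``type``, instead of
returning only ``name`` and ``variable``. A sketch mirroring the unit tests,
with ``a_primitive`` as a purely hypothetical name and ``MLBlock`` mocked out::

    from unittest.mock import MagicMock, patch

    from mlblocks.mlpipeline import MLPipeline

    outputs = {'default': [{'name': 'a_name', 'variable': 'a_variable'}]}
    with patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()):
        pipeline = MLPipeline(['a_primitive'], outputs=outputs)

    pipeline.blocks['a_primitive#1'].produce_output = [
        {'name': 'output', 'type': 'whatever'}
    ]

    print(pipeline.get_outputs('a_primitive#1.output'))
    # [{'name': 'output', 'type': 'whatever', 'variable': 'a_primitive#1.output'}]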
From 1d74b20a4ded2d95b46067a6f280b989d16312ef Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:20 +0200
Subject: [PATCH 060/160] Release notes for v0.3.3
---
HISTORY.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index c3b00ce0..f3dc0a32 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
Changelog
=========
+0.3.3 - 2019-09-09
+------------------
+
+* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala
+
0.3.2 - 2019-08-12
------------------
From 3b06ab885dce7fc601468dc6ee8f1b901bfba8ff Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:24 +0200
Subject: [PATCH 061/160] =?UTF-8?q?Bump=20version:=200.3.3-dev=20=E2=86=92?=
=?UTF-8?q?=200.3.3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 7f6e1eaf..b85b1de0 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.3-dev'
+__version__ = '0.3.3'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9051663..0fa10faa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.3-dev
+current_version = 0.3.3
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 870d1276..4104d912 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.3-dev',
+ version='0.3.3',
zip_safe=False,
)
From 0dcb324c1d7e09e0a04d61dd400105afa7d6c8a5 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:43 +0200
Subject: [PATCH 062/160] =?UTF-8?q?Bump=20version:=200.3.3=20=E2=86=92=200?=
=?UTF-8?q?.3.4-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b85b1de0..8c30609e 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.3'
+__version__ = '0.3.4-dev'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 0fa10faa..de7507c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.3
+current_version = 0.3.4-dev
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4104d912..421dbbd6 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.3',
+ version='0.3.4-dev',
zip_safe=False,
)
From eb78b55e5466b918ed1be8f0a699538b69d26773 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:13:45 +0200
Subject: [PATCH 063/160] Support importing class methods
---
mlblocks/mlblock.py | 12 ++++++++++--
tests/test_mlblock.py | 35 ++++++++++++++++++++++++++++++-----
2 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index db24caa5..f570165b 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -13,9 +13,17 @@
def import_object(object_name):
"""Import an object from its Fully Qualified Name."""
+
if isinstance(object_name, str):
- package, name = object_name.rsplit('.', 1)
- return getattr(importlib.import_module(package), name)
+ parent_name, attribute = object_name.rsplit('.', 1)
+ try:
+ parent = importlib.import_module(parent_name)
+ except ImportError:
+ grand_parent_name, parent_name = parent_name.rsplit('.', 1)
+ grand_parent = importlib.import_module(grand_parent_name)
+ parent = getattr(grand_parent, parent_name)
+
+ return getattr(parent, attribute)
return object_name
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index b4dbc637..355015d0 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -3,19 +3,44 @@
from unittest import TestCase
from unittest.mock import MagicMock, Mock, patch
-from mlblocks.mlblock import MLBlock, import_object
+import pytest
-# import pytest
+from mlblocks.mlblock import MLBlock, import_object
class DummyClass:
+ def a_method(self):
+ pass
+
+
+def dummy_function():
pass
-def test_import_object():
- dummy_class = import_object(__name__ + '.DummyClass')
+class TestImportObject(TestCase):
+
+ def test_class(self):
+ imported = import_object(__name__ + '.DummyClass')
+
+ assert imported is DummyClass
+
+ def test_class_method(self):
+ imported = import_object(__name__ + '.DummyClass.a_method')
+
+ assert imported is DummyClass.a_method
+
+ def test_function(self):
+ imported = import_object(__name__ + '.dummy_function')
+
+ assert imported is dummy_function
+
+ def test_bad_object_name(self):
+ with pytest.raises(AttributeError):
+ import_object(__name__ + '.InvalidName')
- assert dummy_class is DummyClass
+ def test_bad_module(self):
+ with pytest.raises(ModuleNotFoundError):
+ import_object('an.invalid.module')
class TestMLBlock(TestCase):
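With the fallback above, ``import_object`` can now resolve an attribute of a
class as well as a module-level name: it first tries to import the whole parent
path as a module and, on ``ImportError``, imports the grandparent module and
walks down with ``getattr``. A quick illustration with stdlib names::

    import collections

    from mlblocks.mlblock import import_object

    # 'collections.Counter' is not a module, so the first import fails and the
    # fallback imports 'collections', then resolves 'Counter' and 'most_common'.
    method = import_object('collections.Counter.most_common')
    assert method is collections.Counter.most_common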
From 8d53d2a94fcc4323dcc0faf37bc38535b8198fa7 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:17:25 +0200
Subject: [PATCH 064/160] Add configuration to upload release candidates to
PyPI
---
Makefile | 26 +++++++++++++++++++-------
mlblocks/__init__.py | 2 +-
setup.cfg | 15 +++++++++++----
setup.py | 2 +-
tox.ini | 2 +-
5 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/Makefile b/Makefile
index e54e1362..6e8dd203 100644
--- a/Makefile
+++ b/Makefile
@@ -155,7 +155,7 @@ publish: dist ## package and upload a release
.PHONY: bumpversion-release
bumpversion-release: ## Merge master to stable and bumpversion release
- git checkout stable
+ git checkout stable || git checkout -b stable
git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
bumpversion release
git push --tags origin stable
@@ -167,6 +167,10 @@ bumpversion-patch: ## Merge stable to master and bumpversion patch
bumpversion --no-tag patch
git push
+.PHONY: bumpversion-candidate
+bumpversion-candidate: ## Bump the version to the next candidate
+ bumpversion candidate --no-tag
+
.PHONY: bumpversion-minor
bumpversion-minor: ## Bump the version the next minor skipping the release
bumpversion --no-tag minor
@@ -175,23 +179,31 @@ bumpversion-minor: ## Bump the version the next minor skipping the release
bumpversion-major: ## Bump the version the next major skipping the release
bumpversion --no-tag major
-CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
-CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l)
+CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l)
-.PHONY: check-release
-check-release: ## Check if the release can be made
+.PHONY: check-master
+check-master: ## Check if we are in master branch
ifneq ($(CURRENT_BRANCH),master)
$(error Please make the release from master branch\n)
endif
+
+.PHONY: check-history
+check-history: ## Check if HISTORY.md has been modified
ifeq ($(CHANGELOG_LINES),0)
$(error Please insert the release notes in HISTORY.md before releasing)
-else
- @echo "A new release can be made"
endif
+.PHONY: check-release
+check-release: check-master check-history ## Check if the release can be made
+ @echo "A new release can be made"
+
.PHONY: release
release: check-release bumpversion-release publish bumpversion-patch
+.PHONY: release-candidate
+release-candidate: check-master publish bumpversion-candidate
+
.PHONY: release-minor
release-minor: check-release bumpversion-minor release
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 8c30609e..3ede651e 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.4-dev'
+__version__ = '0.3.4.dev0'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index de7507c0..563c9c5c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,18 +1,21 @@
[bumpversion]
-current_version = 0.3.4-dev
+current_version = 0.3.4.dev0
commit = True
tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
serialize =
- {major}.{minor}.{patch}-{release}
+ {major}.{minor}.{patch}-{release}{candidate}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = release
+first_value = dev
values =
dev
release
+[bumpversion:part:candidate]
+
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
@@ -34,8 +37,12 @@ include_trailing_comment = True
line_length = 99
lines_between_types = 0
multi_line_output = 4
-not_skip = __init__.py
use_parentheses = True
+not_skip = __init__.py
+skip_glob = *.bak
+
+[metadata]
+description-file = README.md
[aliases]
test = pytest
diff --git a/setup.py b/setup.py
index 421dbbd6..abc43800 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.4-dev',
+ version='0.3.4.dev0',
zip_safe=False,
)
diff --git a/tox.ini b/tox.ini
index 76529366..666eeab0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,7 +14,7 @@ setenv =
PYTHONPATH = {toxinidir}
extras = test
commands =
- /usr/bin/env python -m pytest --cov=mlblocks
+ /usr/bin/env make test
[testenv:lint]
From 001561a169229ec652c08e4ace32dc3023d4bbd4 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:20:05 +0200
Subject: [PATCH 065/160] Add release-candidate documentation
---
CONTRIBUTING.rst | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4fce53bf..4c01093e 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -195,3 +195,33 @@ Once this is done, run of the following commands:
3. If you are releasing a major version::
make release-major
+
+Release Candidates
+~~~~~~~~~~~~~~~~~~
+
+Sometimes it is necessary or convenient to upload a release candidate to PyPI as a pre-release,
+in order to make some of the new features available for testing in other projects before they
+are included in a full release.
+
+To do so, execute::
+
+ make release-candidate
+
+This will perform the following actions:
+
+1. Build and upload the current version to PyPI as a pre-release, with the format ``X.Y.Z.devN``
+
+2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
+
+After this is done, the new pre-release can be installed by including the ``dev`` suffix in the
+dependency specification, either in ``setup.py``::
+
+ install_requires = [
+ ...
+ 'mlblocks>=X.Y.Z.dev',
+ ...
+ ]
+
+or on the command line::
+
+ pip install 'mlblocks>=X.Y.Z.dev'
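
For illustration only, and not part of the patch above: the reason a ``>=X.Y.Z.dev``
specifier matches release candidates is PEP 440 ordering, under which development releases
sort strictly before the final release they precede. A minimal sketch using the
``packaging`` library, which is not a project dependency:

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # Development releases order strictly before their final release
    assert Version('0.3.4.dev0') < Version('0.3.4.dev1') < Version('0.3.4')

    # A specifier that itself names a pre-release opts in to matching them
    assert Version('0.3.4.dev1') in SpecifierSet('>=0.3.4.dev0')
    assert Version('0.3.4.dev1') not in SpecifierSet('>=0.3.4')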
From 45f9ae2ae6b50a4a6ae1e50f326c130f5a571d69 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:25:45 +0200
Subject: [PATCH 066/160] Fix error in Python 3.5 due to a nonexistent exception type
---
tests/test_mlblock.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 355015d0..93adb0dd 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -39,7 +39,7 @@ def test_bad_object_name(self):
import_object(__name__ + '.InvalidName')
def test_bad_module(self):
- with pytest.raises(ModuleNotFoundError):
+ with pytest.raises(ImportError):
import_object('an.invalid.module')
From 09aa6e9466956d3883895b573d5ea03ad257b501 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:20:52 +0200
Subject: [PATCH 067/160] Fix release-candidate version format
---
setup.cfg | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/setup.cfg b/setup.cfg
index 563c9c5c..a122a298 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,9 +2,9 @@
current_version = 0.3.4.dev0
commit = True
tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
serialize =
- {major}.{minor}.{patch}-{release}{candidate}
+ {major}.{minor}.{patch}.{release}{candidate}
{major}.{minor}.{patch}
[bumpversion:part:release]
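
For illustration, the reconstructed ``parse`` pattern above can be checked directly with
Python's ``re`` module; a minimal sketch:

    import re

    # The pattern bumpversion compiles from setup.cfg, with the release
    # separator switched from '-' to '.' by this patch
    PARSE = re.compile(
        r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
        r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
    )

    assert PARSE.match('0.3.4.dev0').groupdict() == {
        'major': '0', 'minor': '3', 'patch': '4',
        'release': 'dev', 'candidate': '0',
    }

    # Final releases simply leave the optional groups empty
    assert PARSE.match('0.3.4').group('release') is None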
From 2b5d7900a22ce72b7b59ef85a0f024ffec0a0c0d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:21:28 +0200
Subject: [PATCH 068/160] =?UTF-8?q?Bump=20version:=200.3.4.dev0=20?=
=?UTF-8?q?=E2=86=92=200.3.4.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3ede651e..81b45593 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.4.dev0'
+__version__ = '0.3.4.dev1'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a122a298..0c2ea21a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.4.dev0
+current_version = 0.3.4.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index abc43800..da4bb6f3 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.4.dev0',
+ version='0.3.4.dev1',
zip_safe=False,
)
From d790938e8cee8528fe90725eb145fde3c6bd99e2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 30 Oct 2019 13:19:30 -0400
Subject: [PATCH 069/160] New partial output with context - WIP
---
mlblocks/mlpipeline.py | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 9de286cb..d0d67f8f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -145,8 +145,11 @@ def _get_block_outputs(self, block_name):
"""Get the list of output variables for the given block."""
block = self.blocks[block_name]
outputs = deepcopy(block.produce_output)
+ output_names = self.output_names.get(block_name, dict())
for output in outputs:
- output['variable'] = '{}.{}'.format(block_name, output['name'])
+ name = output['name']
+ context_name = output_names.get(name, name)
+ output['variable'] = '{}.{}'.format(block_name, context_name)
return outputs
@@ -606,7 +609,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
if y is not None:
context['y'] = y
- if output_ is not None:
+ if isinstance(output_, str):
output_variables = self.get_output_variables(output_)
outputs = output_variables.copy()
output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables}
@@ -615,6 +618,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
outputs = None
output_blocks = set()
+ if isinstance(output_, int):
+ output_ = self._get_block_name(output_)
+
if isinstance(start_, int):
start_ = self._get_block_name(start_)
@@ -628,16 +634,19 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
self._fit_block(block, block_name, context)
- if (block_name != self._last_block_name) or (block_name in output_blocks):
+ not_last_block = block_name != self._last_block_name
+ if not_last_block or (block_name == output_) or (block_name in output_blocks):
self._produce_block(block, block_name, context, output_variables, outputs)
# We already captured the output from this block
if block_name in output_blocks:
output_blocks.remove(block_name)
+ elif block_name == output_:
+ return context
# If there was an output_ but there are no pending
# outputs we are done.
- if output_ is not None and not output_blocks:
+ if output_variables is not None and not output_blocks:
if len(outputs) > 1:
return tuple(outputs)
else:
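
The intent of this WIP change, sketched as usage with hypothetical block names, and
assuming ``X_train`` and ``y_train`` are already loaded:

    from mlblocks import MLPipeline

    pipeline = MLPipeline(['a_primitive', 'another_primitive'])

    # With this patch, passing a block name as output_ makes fit() stop
    # after that block and return the full context built so far ...
    context = pipeline.fit(X_train, y_train, output_='a_primitive#1')

    # ... while a 'block.variable' specification still returns only the
    # requested output variables, as before.
    out = pipeline.fit(X_train, y_train, output_='a_primitive#1.y')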
From 1a0eb099d177753d07797757bf0b3ae9a20de1f2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 31 Oct 2019 12:59:46 -0400
Subject: [PATCH 070/160] Allow getting full context in partial outputs
---
mlblocks/mlpipeline.py | 61 +++++----
tests/features/test_partial_outputs.py | 7 +-
tests/test_mlpipeline.py | 175 +++++++++++++++++++------
3 files changed, 171 insertions(+), 72 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d0d67f8f..21aa7ecc 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,6 +4,7 @@
import json
import logging
+import re
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
@@ -198,12 +199,15 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
if hyperparameters:
self.set_hyperparameters(hyperparameters)
+ self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?')
+
def _get_str_output(self, output):
"""Get the outputs that correspond to the str specification."""
if output in self.outputs:
return self.outputs[output]
elif output in self.blocks:
- return self._get_block_outputs(output)
+ return [{'name': output, 'variable': output}]
+ # return self._get_block_outputs(output)
elif '.' in output:
block_name, variable_name = output.rsplit('.', 1)
block = self.blocks.get(block_name)
@@ -260,11 +264,11 @@ def get_outputs(self, outputs='default'):
computed = list()
for output in outputs:
+ if isinstance(output, int):
+ output = self._get_block_name(output)
+
if isinstance(output, str):
computed.extend(self._get_str_output(output))
- elif isinstance(output, int):
- block_name = self._get_block_name(output)
- computed.extend(self._get_block_outputs(block_name))
else:
raise TypeError('Output Specification can only be str or int')
@@ -316,6 +320,18 @@ def get_output_variables(self, outputs='default'):
outputs = self.get_outputs(outputs)
return [output['variable'] for output in outputs]
+ def _extract_block_name(self, variable_name):
+ return self._re_block_name.search(variable_name).group(1)
+
+ def _prepare_outputs(self, outputs):
+ output_variables = self.get_output_variables(outputs)
+ outputs = output_variables.copy()
+ output_blocks = {
+ self._extract_block_name(variable)
+ for variable in output_variables
+ }
+ return output_variables, outputs, output_blocks
+
@staticmethod
def _flatten_dict(hyperparameters):
return {
@@ -519,13 +535,11 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
return output_dict
- def _update_outputs(self, block_name, output_variables, outputs, outputs_dict):
+ def _update_outputs(self, variable_name, output_variables, outputs, value):
"""Set the requested block outputs into the outputs list in the right place."""
- for key, value in outputs_dict.items():
- variable_name = '{}.{}'.format(block_name, key)
- if variable_name in output_variables:
- index = output_variables.index(variable_name)
- outputs[index] = deepcopy(value)
+ if variable_name in output_variables:
+ index = output_variables.index(variable_name)
+ outputs[index] = deepcopy(value)
def _fit_block(self, block, block_name, context):
"""Get the block args from the context and fit the block."""
@@ -554,7 +568,12 @@ def _produce_block(self, block, block_name, context, output_variables, outputs):
context.update(outputs_dict)
if output_variables:
- self._update_outputs(block_name, output_variables, outputs, outputs_dict)
+ if block_name in output_variables:
+ self._update_outputs(block_name, output_variables, outputs, context)
+ else:
+ for key, value in outputs_dict.items():
+ variable_name = '{}.{}'.format(block_name, key)
+ self._update_outputs(variable_name, output_variables, outputs, value)
except Exception:
if self.verbose:
@@ -609,17 +628,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
if y is not None:
context['y'] = y
- if isinstance(output_, str):
- output_variables = self.get_output_variables(output_)
- outputs = output_variables.copy()
- output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables}
- else:
+ if output_ is None:
output_variables = None
outputs = None
output_blocks = set()
-
- if isinstance(output_, int):
- output_ = self._get_block_name(output_)
+ else:
+ output_variables, outputs, output_blocks = self._prepare_outputs(output_)
if isinstance(start_, int):
start_ = self._get_block_name(start_)
@@ -634,15 +648,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
self._fit_block(block, block_name, context)
- not_last_block = block_name != self._last_block_name
- if not_last_block or (block_name == output_) or (block_name in output_blocks):
+ if (block_name != self._last_block_name) or (block_name in output_blocks):
self._produce_block(block, block_name, context, output_variables, outputs)
# We already captured the output from this block
if block_name in output_blocks:
output_blocks.remove(block_name)
- elif block_name == output_:
- return context
# If there was an output_ but there are no pending
# outputs we are done.
@@ -695,9 +706,7 @@ def predict(self, X=None, output_='default', start_=None, **kwargs):
if X is not None:
context['X'] = X
- output_variables = self.get_output_variables(output_)
- outputs = output_variables.copy()
- output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables}
+ output_variables, outputs, output_blocks = self._prepare_outputs(output_)
if isinstance(start_, int):
start_ = self._get_block_name(start_)
diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py
index d31d2dd8..50739cea 100644
--- a/tests/features/test_partial_outputs.py
+++ b/tests/features/test_partial_outputs.py
@@ -70,13 +70,14 @@ def test_fit_output(self):
y = np.array([
0, 0, 0, 0, 1
])
+ context = {'X': X, 'y': y}
almost_equal(named_out, y)
assert len(list_out) == 2
almost_equal(list_out[0], y)
- almost_equal(list_out[1], X)
- almost_equal(X, int_out)
- almost_equal(X, str_out)
+ almost_equal(list_out[1], context)
+ almost_equal(context, int_out)
+ almost_equal(context, str_out)
almost_equal(X, str_out_variable)
assert no_output is None
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 7062e38e..f2edc36f 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -2,25 +2,36 @@
from collections import OrderedDict
from unittest import TestCase
-from unittest.mock import MagicMock, Mock, call, patch
+from unittest.mock import MagicMock, call, patch
+import pytest
+
+from mlblocks.mlblock import MLBlock
from mlblocks.mlpipeline import MLPipeline
+def get_mlblock_mock(*args, **kwargs):
+ return MagicMock(autospec=MLBlock)
+
+
class TestMLPipline(TestCase):
@patch('mlblocks.mlpipeline.LOGGER')
@patch('mlblocks.mlpipeline.MLBlock')
def test___init__(self, mlblock_mock, logger_mock):
- blocks = [Mock(), Mock(), Mock()]
- last_block = Mock()
+ blocks = [
+ get_mlblock_mock(),
+ get_mlblock_mock(),
+ get_mlblock_mock(),
+ get_mlblock_mock()
+ ]
+ last_block = blocks[-1]
last_block.produce_output = [
{
'name': 'y',
'type': 'array'
}
]
- blocks.append(last_block)
mlblock_mock.side_effect = blocks
primitives = [
@@ -93,7 +104,7 @@ def test___init__(self, mlblock_mock, logger_mock):
'a.primitive.Name'
)
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_tunable_hyperparameters(self):
mlpipeline = MLPipeline(['a_primitive'])
tunable = dict()
@@ -104,7 +115,7 @@ def test_get_tunable_hyperparameters(self):
assert returned == tunable
assert returned is not tunable
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_tunable_hyperparameters_flat(self):
mlpipeline = MLPipeline(['a_primitive'])
mlpipeline._tunable_hyperparameters = {
@@ -160,13 +171,13 @@ def test_get_tunable_hyperparameters_flat(self):
}
assert returned == expected
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_hyperparameters(self):
- block_1 = Mock()
+ block_1 = get_mlblock_mock()
block_1.get_hyperparameters.return_value = {
'a': 'a'
}
- block_2 = Mock()
+ block_2 = get_mlblock_mock()
block_2.get_hyperparameters.return_value = {
'b': 'b',
'c': 'c',
@@ -192,13 +203,13 @@ def test_get_hyperparameters(self):
block_1.get_hyperparameters.assert_called_once_with()
block_2.get_hyperparameters.assert_called_once_with()
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_hyperparameters_flat(self):
- block_1 = Mock()
+ block_1 = get_mlblock_mock()
block_1.get_hyperparameters.return_value = {
'a': 'a'
}
- block_2 = Mock()
+ block_2 = get_mlblock_mock()
block_2.get_hyperparameters.return_value = {
'b': 'b',
'c': 'c',
@@ -220,10 +231,10 @@ def test_get_hyperparameters_flat(self):
block_1.get_hyperparameters.assert_called_once_with()
block_2.get_hyperparameters.assert_called_once_with()
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_set_hyperparameters(self):
- block_1 = Mock()
- block_2 = Mock()
+ block_1 = get_mlblock_mock()
+ block_2 = get_mlblock_mock()
blocks = OrderedDict((
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
@@ -241,10 +252,10 @@ def test_set_hyperparameters(self):
block_1.set_hyperparameters.assert_not_called()
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_set_hyperparameters_flat(self):
- block_1 = Mock()
- block_2 = Mock()
+ block_1 = get_mlblock_mock()
+ block_2 = get_mlblock_mock()
blocks = OrderedDict((
('a.primitive.Name#1', block_1),
('a.primitive.Name#2', block_2),
@@ -260,7 +271,7 @@ def test_set_hyperparameters_flat(self):
block_1.set_hyperparameters.assert_not_called()
block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test__get_block_args(self):
input_names = {
'a_block': {
@@ -298,9 +309,10 @@ def test__get_block_args(self):
}
assert args == expected
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test__get_outputs_no_outputs(self):
- self_ = Mock()
+ self_ = MagicMock(autospec=MLPipeline)
+
self_._last_block_name = 'last_block'
self_._get_block_outputs.return_value = ['some', 'outputs']
@@ -315,9 +327,9 @@ def test__get_outputs_no_outputs(self):
self_._get_block_outputs.assert_called_once_with('last_block')
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test__get_outputs_defaults(self):
- self_ = Mock()
+ self_ = MagicMock(autospec=MLPipeline)
pipeline = dict()
outputs = {
@@ -331,9 +343,9 @@ def test__get_outputs_defaults(self):
assert returned == expected
self_._get_block_outputs.assert_not_called()
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test__get_outputs_additional(self):
- self_ = Mock()
+ self_ = MagicMock(autospec=MLPipeline)
pipeline = {
'outputs': {
@@ -351,26 +363,90 @@ def test__get_outputs_additional(self):
assert returned == expected
self_._get_block_outputs.assert_not_called()
- def test_get_outputs_str(self):
- pass
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_outputs_str_named(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_variable',
+ 'type': 'a_type',
+ }
+ ],
+ 'debug': [
+ {
+ 'name': 'another_name',
+ 'variable': 'another_variable',
+ }
+ ]
+ }
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
+ returned = pipeline.get_outputs('debug')
+
+ expected = [
+ {
+ 'name': 'another_name',
+ 'variable': 'another_variable',
+ }
+ ]
+
+ assert returned == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_outputs_str_variable(self):
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'whatever'
+ }
+ ]
+
+ returned = pipeline.get_outputs('a_primitive#1.output')
+
+ expected = [
+ {
+ 'name': 'output',
+ 'type': 'whatever',
+ 'variable': 'a_primitive#1.output'
+ }
+ ]
+
+ assert returned == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_outputs_str_block(self):
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+ returned = pipeline.get_outputs('a_primitive#1')
+
+ expected = [
+ {
+ 'name': 'a_primitive#1',
+ 'variable': 'a_primitive#1',
+ }
+ ]
+
+ assert returned == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_outputs_int(self):
- pass
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
- def test_get_outputs_list_of_str(self):
- pass
+ returned = pipeline.get_outputs(-1)
- def test_get_outputs_list_of_int(self):
- pass
+ expected = [
+ {
+ 'name': 'another_primitive#1',
+ 'variable': 'another_primitive#1',
+ }
+ ]
- def test_get_outputs_named_outputs(self):
- pass
+ assert returned == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_outputs_combination(self):
- pass
-
- @patch('mlblocks.mlpipeline.MLBlock')
- def test_get_outputs_invalid(self, mlblock_mock):
outputs = {
'default': [
{
@@ -386,7 +462,6 @@ def test_get_outputs_invalid(self, mlblock_mock):
}
]
}
- mlblock_mock.side_effect = [MagicMock(), MagicMock()]
pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
pipeline.blocks['a_primitive#1'].produce_output = [
@@ -414,8 +489,8 @@ def test_get_outputs_invalid(self, mlblock_mock):
'variable': 'another_variable',
},
{
- 'name': 'something',
- 'variable': 'another_primitive#1.something',
+ 'name': 'another_primitive#1',
+ 'variable': 'another_primitive#1',
},
{
'name': 'output',
@@ -426,7 +501,21 @@ def test_get_outputs_invalid(self, mlblock_mock):
assert returned == expected
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_outputs_invalid(self):
+ pipeline = MLPipeline(['a_primitive'])
+
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'whatever'
+ }
+ ]
+
+ with pytest.raises(ValueError):
+ pipeline.get_outputs('a_primitive#1.invalid')
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_output_names(self):
outputs = {
'default': [
@@ -443,7 +532,7 @@ def test_get_output_names(self):
assert names == ['a_name']
- @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_output_variables(self):
outputs = {
'default': [
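
The block-name pattern introduced above can be exercised on its own; a minimal sketch:

    import re

    RE_BLOCK_NAME = re.compile(r'(^[^#]+#\d+)(\..*)?')

    # Both a bare block name and a fully qualified variable name resolve
    # to the producing block, even when the variable itself contains dots
    assert RE_BLOCK_NAME.search('a_primitive#1').group(1) == 'a_primitive#1'
    assert RE_BLOCK_NAME.search('a_primitive#1.some.output').group(1) == 'a_primitive#1'

This is why ``_extract_block_name`` replaces the previous ``rsplit('.', 1)`` logic, which
split variable names containing extra dots at the wrong place.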
From 6019adfeff7f167dcea2d7ec2ffc9a7864c16fee Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 31 Oct 2019 15:26:35 -0400
Subject: [PATCH 071/160] =?UTF-8?q?Bump=20version:=200.3.4.dev1=20?=
=?UTF-8?q?=E2=86=92=200.3.4.dev2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 81b45593..936c210f 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.4.dev1'
+__version__ = '0.3.4.dev2'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 0c2ea21a..58f63f5c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.4.dev1
+current_version = 0.3.4.dev2
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index da4bb6f3..60c97534 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.4.dev1',
+ version='0.3.4.dev2',
zip_safe=False,
)
From b7baf968b4be9d1b59384b13c9c55d5d5da3299e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 4 Nov 2019 10:05:26 -0500
Subject: [PATCH 072/160] Release notes for v0.3.4
---
HISTORY.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index f3dc0a32..5b5d4f0b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,12 @@
Changelog
=========
+0.3.4 - 2019-11-01
+------------------
+
+* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala
+* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala
+
0.3.3 - 2019-09-09
------------------
From b0cd3808f3291d9bd043362ff2e827ac626f8ef9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 4 Nov 2019 10:05:27 -0500
Subject: [PATCH 073/160] =?UTF-8?q?Bump=20version:=200.3.4.dev2=20?=
=?UTF-8?q?=E2=86=92=200.3.4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 936c210f..e4aa9838 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.4.dev2'
+__version__ = '0.3.4'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 58f63f5c..709511b4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.4.dev2
+current_version = 0.3.4
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 60c97534..7b243501 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.4.dev2',
+ version='0.3.4',
zip_safe=False,
)
From 6ede62caed212b84067021fca2d3b29d187a8554 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 4 Nov 2019 10:05:40 -0500
Subject: [PATCH 074/160] =?UTF-8?q?Bump=20version:=200.3.4=20=E2=86=92=200?=
=?UTF-8?q?.3.5.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index e4aa9838..618e7a55 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.4'
+__version__ = '0.3.5.dev0'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 709511b4..61208b1f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.4
+current_version = 0.3.5.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 7b243501..09483fb3 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/HDI-Project/MLBlocks',
- version='0.3.4',
+ version='0.3.5.dev0',
zip_safe=False,
)
From 3ce7d89c3e81743c73400c0694ebb6e893acbc51 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 12 Dec 2019 15:38:54 +0100
Subject: [PATCH 075/160] Update paper references
---
README.md | 31 +++++++++++++++++++++++++++----
1 file changed, 27 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 19f740ed..7c152fa3 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
* Free software: MIT license
* Documentation: https://HDI-Project.github.io/MLBlocks
-- Homepage: https://github.com/HDI-Project/MLBlocks
+* Homepage: https://github.com/HDI-Project/MLBlocks
# MLBlocks
@@ -237,10 +237,33 @@ If you want to learn more about how to tune the pipeline hyperparameters, save a
the pipelines using JSON annotations or build complex multi-branched pipelines, please
check our [documentation](https://HDI-Project.github.io/MLBlocks).
-# History
+## Citing MLBlocks
+
+If you use MLBlocks, please consider citing our related papers.
+
+For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at
+the MIT Data To AI Lab, please see:
+
+Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar:
+Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv
+Preprint 1905.08942. 2019.
+
+``` bibtex
+@article{smith2019mlbazaar,
+ author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan},
+ title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development},
+ journal = {arXiv e-prints},
+ year = {2019},
+ eid = {arXiv:1905.08942},
+ pages = {arXiv:1905.08942},
+ archivePrefix = {arXiv},
+ eprint = {1905.08942},
+}
+```
+
+For the first MLBlocks version from 2015, which was designed only for multi-table, multi-entity
+temporal data, please refer to Bryan Collazo’s thesis:
-In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal
-data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis:
* [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf).
Bryan Collazo. Masters thesis, MIT EECS, 2015.
From 949c8b1d36abe3792e38bed3501645fde279a075 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 12 Dec 2019 15:53:36 +0100
Subject: [PATCH 076/160] Restrict dependency versions
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index 09483fb3..1e8ef2ad 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,7 @@
'urllib3>=1.20,<1.25',
'setuptools>=41.0.0',
'numpy<1.17',
+ 'python-dateutil<2.8.1,>=2.1',
]
From 6b8381a069e235d8083a02cac0e72550db3955e2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Dec 2019 19:18:09 +0100
Subject: [PATCH 077/160] Allow loading from JSON. Deprecate old methods
---
mlblocks/discovery.py | 44 ++++++++++++++----------------------
mlblocks/mlpipeline.py | 11 +++++++++
tests/test_discovery.py | 49 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 77 insertions(+), 27 deletions(-)
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 9a1dbef5..24a469da 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -198,6 +198,12 @@ def get_pipelines_paths():
return _PIPELINES_PATHS + _load_entry_points('pipelines')
+def _load_json(json_path):
+ with open(json_path, 'r') as json_file:
+ LOGGER.debug('Loading %s', json_path)
+ return json.load(json_file)
+
+
def _load(name, paths):
"""Locate and load the JSON annotation in any of the given paths.
@@ -206,8 +212,7 @@ def _load(name, paths):
Args:
name (str):
- name of the JSON to look for. The name should not contain the
- ``.json`` extension, as it will be added dynamically.
+ Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
paths (list):
list of paths where the primitives will be looked for.
@@ -215,6 +220,9 @@ def _load(name, paths):
dict:
The content of the JSON annotation file loaded into a dict.
"""
+ if os.path.isfile(name):
+ return _load_json(name)
+
for base_path in paths:
parts = name.split('.')
number_of_parts = len(parts)
@@ -225,12 +233,7 @@ def _load(name, paths):
json_path = os.path.join(folder, filename)
if os.path.isfile(json_path):
- with open(json_path, 'r') as json_file:
- LOGGER.debug('Loading %s from %s', name, json_path)
- return json.load(json_file)
-
-
-_PRIMITIVES = dict()
+ return _load_json(json_path)
def load_primitive(name):
@@ -241,8 +244,7 @@ def load_primitive(name):
Args:
name (str):
- name of the JSON to look for. The name should not contain the
- ``.json`` extension, as it will be added dynamically.
+ Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
Returns:
dict:
@@ -252,20 +254,13 @@ def load_primitive(name):
ValueError:
A ``ValueError`` will be raised if the primitive cannot be found.
"""
- primitive = _PRIMITIVES.get(name)
+ primitive = _load(name, get_primitives_paths())
if primitive is None:
- primitive = _load(name, get_primitives_paths())
- if primitive is None:
- raise ValueError("Unknown primitive: {}".format(name))
-
- _PRIMITIVES[name] = primitive
+ raise ValueError("Unknown primitive: {}".format(name))
return primitive
-_PIPELINES = dict()
-
-
def load_pipeline(name):
"""Locate and load the pipeline JSON annotation.
@@ -274,8 +269,7 @@ def load_pipeline(name):
Args:
name (str):
- name of the JSON to look for. The name should not contain the
- ``.json`` extension, as it will be added dynamically.
+ Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
Returns:
dict:
@@ -285,13 +279,9 @@ def load_pipeline(name):
ValueError:
A ``ValueError`` will be raised if the pipeline cannot be found.
"""
- pipeline = _PIPELINES.get(name)
+ pipeline = _load(name, get_pipelines_paths())
if pipeline is None:
- pipeline = _load(name, get_pipelines_paths())
- if pipeline is None:
- raise ValueError("Unknown pipeline: {}".format(name))
-
- _PIPELINES[name] = pipeline
+ raise ValueError("Unknown pipeline: {}".format(name))
return pipeline
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..962d7c19 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -5,6 +5,7 @@
import json
import logging
import re
+import warnings
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
@@ -814,6 +815,11 @@ def from_dict(cls, metadata):
A new MLPipeline instance with the details found in the
given specification dictionary.
"""
+ warnings.warn(
+ 'MLPipeline.from_dict(pipeline_dict) is deprecated and will be removed in a '
+ 'later release. Please use MLPipeline(dict) instead.',
+ DeprecationWarning
+ )
return cls(metadata)
@classmethod
@@ -831,6 +837,11 @@ def load(cls, path):
A new MLPipeline instance with the specification found
in the JSON file.
"""
+ warnings.warn(
+ 'MLPipeline.load(path) is deprecated and will be removed in a later release. '
+ 'Please use MLPipeline(path) instead.',
+ DeprecationWarning
+ )
with open(path, 'r') as in_file:
metadata = json.load(in_file)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index dc3eca87..a11fc02c 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,55 @@ def test__load_success():
assert primitive == loaded
+def test__load_json_path():
+ primitive = {
+ 'name': 'temp.primitive',
+ 'primitive': 'temp.primitive'
+ }
+
+ with tempfile.TemporaryDirectory() as tempdir:
+ paths = [tempdir]
+ primitive_path = os.path.join(tempdir, 'temp.primitive.json')
+ with open(primitive_path, 'w') as primitive_file:
+ json.dump(primitive, primitive_file, indent=4)
+
+ loaded = discovery._load(primitive_path, paths)
+
+ assert primitive == loaded
+
+
+def _load(name, paths):
+ """Locate and load the JSON annotation in any of the given paths.
+
+ All the given paths will be scanned to find a JSON file with the given name,
+ and as soon as a JSON with the given name is found it is returned.
+
+ Args:
+ name (str):
+ Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
+ paths (list):
+ list of paths where the primitives will be looked for.
+
+ Returns:
+ dict:
+ The content of the JSON annotation file loaded into a dict.
+ """
+ if os.path.isfile(name):
+ return _load_json(name)
+
+ for base_path in paths:
+ parts = name.split('.')
+ number_of_parts = len(parts)
+
+ for folder_parts in range(number_of_parts):
+ folder = os.path.join(base_path, *parts[:folder_parts])
+ filename = '.'.join(parts[folder_parts:]) + '.json'
+ json_path = os.path.join(folder, filename)
+
+ if os.path.isfile(json_path):
+ return _load_json(json_path)
+
+
@patch('mlblocks.discovery.get_primitives_paths')
@patch('mlblocks.discovery._load')
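
In usage terms, after this patch both lookup forms resolve, and annotations are re-read on
every call now that the module-level caches are gone. A sketch, where the file path is
hypothetical:

    from mlblocks.discovery import load_primitive

    # Lookup by name, scanning the configured primitives paths
    primitive = load_primitive('sklearn.impute.SimpleImputer')

    # Direct path to a JSON annotation file, newly supported here
    primitive = load_primitive('/some/path/sklearn.impute.SimpleImputer.json')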
From be684dd593f89cd21bd74efb53d6aa97b8c02970 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Dec 2019 19:26:05 +0100
Subject: [PATCH 078/160] Remove unneeded code
---
tests/test_discovery.py | 34 +---------------------------------
1 file changed, 1 insertion(+), 33 deletions(-)
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index a11fc02c..25e6e444 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,7 @@ def test__load_success():
assert primitive == loaded
+
def test__load_json_path():
primitive = {
'name': 'temp.primitive',
@@ -179,39 +180,6 @@ def test__load_json_path():
assert primitive == loaded
-def _load(name, paths):
- """Locate and load the JSON annotation in any of the given paths.
-
- All the given paths will be scanned to find a JSON file with the given name,
- and as soon as a JSON with the given name is found it is returned.
-
- Args:
- name (str):
- Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
- paths (list):
- list of paths where the primitives will be looked for.
-
- Returns:
- dict:
- The content of the JSON annotation file loaded into a dict.
- """
- if os.path.isfile(name):
- return _load_json(name)
-
- for base_path in paths:
- parts = name.split('.')
- number_of_parts = len(parts)
-
- for folder_parts in range(number_of_parts):
- folder = os.path.join(base_path, *parts[:folder_parts])
- filename = '.'.join(parts[folder_parts:]) + '.json'
- json_path = os.path.join(folder, filename)
-
- if os.path.isfile(json_path):
- return _load_json(json_path)
-
-
-
@patch('mlblocks.discovery.get_primitives_paths')
@patch('mlblocks.discovery._load')
def test__load_primitive_value_error(load_mock, gpp_mock):
From 1920227548edbb11b851b1864044cabc577b8e03 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 9 Jan 2020 11:41:23 -0500
Subject: [PATCH 079/160] Add get_inputs function
---
mlblocks/mlpipeline.py | 78 ++++++++++++++++++++++++++++
tests/test_mlpipeline.py | 107 +++++++++++++++++++++++++++++++++++++++
2 files changed, 185 insertions(+)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..dce30cfe 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -154,6 +154,45 @@ def _get_block_outputs(self, block_name):
return outputs
+ def _get_block_outputs_dict(self, block_name):
+ """Get dictionary of output variables for the given block."""
+ block = self.blocks[block_name]
+ outputs = deepcopy(block.produce_output)
+ output_names = self.output_names.get(block_name, dict())
+ output_dict = {}
+ for output in outputs:
+ name = output['name']
+ context_name = output_names.get(name, name)
+ output_dict[context_name] = output
+
+ return output_dict
+
+ def _get_block_inputs_dict(self, block_name):
+ """Get dictionary of input variables for the given block."""
+ block = self.blocks[block_name]
+ print(block.produce_args)
+ inputs = deepcopy(block.produce_args)
+ input_names = self.input_names.get(block_name, dict())
+ inputs_dict = {}
+ for input_value in inputs:
+ name = input_value['name']
+ context_name = input_names.get(name, name)
+ inputs_dict[context_name] = input_value
+ return inputs_dict
+
+ def _get_block_fit_inputs_dict(self, block_name):
+ """Get the list of fit input variables for the given block."""
+ block = self.blocks[block_name]
+ fit_inputs = deepcopy(block.fit_args)
+ input_names = self.input_names.get(block_name, dict())
+ fit_inputs_dict = {}
+ for fit_input in fit_inputs:
+ name = fit_input['name']
+ context_name = input_names.get(name, name)
+ fit_inputs_dict[context_name] = fit_input
+
+ return fit_inputs_dict
+
def _get_outputs(self, pipeline, outputs):
"""Get the output definitions from the pipeline dictionary.
@@ -224,6 +263,45 @@ def _get_str_output(self, output):
raise ValueError('Invalid Output Specification: {}'.format(output))
+ def get_inputs(self, fit=True):
+ """Get a dictionary mapping all input variable names required by the
+ pipeline to a dictionary with their specified information.
+
+ Can be specified to include fit arguments.
+
+ Args:
+ fit (bool):
+ Optional argument to include fit arguments or not. Defaults to ``True``.
+
+ Returns:
+ dictionary:
+ A dictionary mapping every input variable's name to a dictionary
+ specifying the information corresponding to that input variable.
+ Each dictionary contains the entry ``name``, as
+ well as any other metadata that may have been included in the
+ pipeline inputs specification.
+
+ Raises:
+ ValueError:
+ If an input specification is not valid.
+ TypeError:
+ If the type of a specification is not an str or an int.
+ """
+ inputs = dict()
+ for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards
+ produce_outputs = self._get_block_outputs_dict(block_name)
+ for produce_output_name in produce_outputs.keys():
+ inputs.pop(produce_output_name, None)
+
+ produce_inputs = self._get_block_inputs_dict(block_name)
+ inputs.update(produce_inputs)
+
+ if fit:
+ fit_inputs = self._get_block_fit_inputs_dict(block_name)
+ inputs.update(fit_inputs)
+
+ return inputs
+
def get_outputs(self, outputs='default'):
"""Get the list of output variables that correspond to the specified outputs.
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index f2edc36f..88cb8c44 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -549,6 +549,113 @@ def test_get_output_variables(self):
assert names == ['a_variable']
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_inputs_fit(self):
+ expected = {
+ 'input': {
+ 'name': 'input',
+ 'type': 'whatever',
+ },
+ 'fit_input': {
+ 'name': 'fit_input',
+ 'type': 'whatever',
+ },
+ 'another_input': {
+ 'name': 'another_input',
+ 'type': 'another_whatever',
+ }
+
+ }
+
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ pipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ }
+ ]
+
+ pipeline.blocks['another_primitive#1'].produce_args = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ },
+ {
+ 'name': 'another_input',
+ 'type': 'another_whatever'
+ }
+ ]
+
+ inputs = pipeline.get_inputs()
+ assert inputs == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_inputs_no_fit(self):
+ expected = {
+ 'input': {
+ 'name': 'input',
+ 'type': 'whatever',
+ },
+ 'another_input': {
+ 'name': 'another_input',
+ 'type': 'another_whatever',
+ }
+
+ }
+
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ pipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ }
+ ]
+
+ pipeline.blocks['another_primitive#1'].produce_args = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ },
+ {
+ 'name': 'another_input',
+ 'type': 'another_whatever'
+ }
+ ]
+
+ inputs = pipeline.get_inputs(fit=False)
+
+ assert inputs == expected
+
def test_fit(self):
pass
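
The traversal performed by ``get_inputs`` can be restated standalone; a simplified sketch
of the same algorithm:

    def pipeline_inputs(blocks):
        """blocks: ordered list of (produce_args, produce_output) name lists."""
        # Walk the pipeline backwards: variables produced by an earlier
        # block are dropped, and whatever remains must come from the caller
        inputs = {}
        for produce_args, produce_output in reversed(blocks):
            for name in produce_output:
                inputs.pop(name, None)
            inputs.update({name: name for name in produce_args})
        return inputs

    # Block 1 needs 'X' and emits 'output'; block 2 consumes 'output' and
    # additionally needs 'extra', so the pipeline inputs are X and extra
    blocks = [(['X'], ['output']), (['output', 'extra'], [])]
    assert sorted(pipeline_inputs(blocks)) == ['X', 'extra']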
From 0d2108f00b5daa62aa37b6ce715ac7ea01bc0b3f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 9 Jan 2020 11:46:21 -0500
Subject: [PATCH 080/160] Remove incorrect docstring
---
mlblocks/mlpipeline.py | 6 ------
1 file changed, 6 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dce30cfe..7f23bf28 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -280,12 +280,6 @@ def get_inputs(self, fit=True):
Each dictionary contains the entry ``name``, as
well as any other metadata that may have been included in the
pipeline inputs specification.
-
- Raises:
- ValueError:
- If an input specification is not valid.
- TypeError:
- If the type of a specification is not an str or an int.
"""
inputs = dict()
for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards
From 4f456bdc3c5cb7200d0ca5e36a0ba05ec1e68e9f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Mon, 13 Jan 2020 17:09:40 -0500
Subject: [PATCH 081/160] Address comments
---
mlblocks/mlpipeline.py | 85 +++++++++++++++++-----------------------
tests/test_mlpipeline.py | 23 +++++++++++
2 files changed, 60 insertions(+), 48 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 7f23bf28..fbd5bcf0 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -144,54 +144,35 @@ def _get_pipeline_dict(pipeline, primitives):
def _get_block_outputs(self, block_name):
"""Get the list of output variables for the given block."""
- block = self.blocks[block_name]
- outputs = deepcopy(block.produce_output)
- output_names = self.output_names.get(block_name, dict())
- for output in outputs:
- name = output['name']
- context_name = output_names.get(name, name)
+ outputs = self._get_block_variables(block_name,
+ 'produce_output',
+ self.output_names.get(block_name, dict()))
+ for context_name, output in outputs.items():
output['variable'] = '{}.{}'.format(block_name, context_name)
- return outputs
-
- def _get_block_outputs_dict(self, block_name):
- """Get dictionary of output variables for the given block."""
- block = self.blocks[block_name]
- outputs = deepcopy(block.produce_output)
- output_names = self.output_names.get(block_name, dict())
- output_dict = {}
- for output in outputs:
- name = output['name']
- context_name = output_names.get(name, name)
- output_dict[context_name] = output
+ return list(outputs.values())
- return output_dict
+ def _get_block_variables(self, block_name, variables_attr, names):
+ """Get dictionary of variable names to the variable for a given block
- def _get_block_inputs_dict(self, block_name):
- """Get dictionary of input variables for the given block."""
- block = self.blocks[block_name]
- print(block.produce_args)
- inputs = deepcopy(block.produce_args)
- input_names = self.input_names.get(block_name, dict())
- inputs_dict = {}
- for input_value in inputs:
- name = input_value['name']
- context_name = input_names.get(name, name)
- inputs_dict[context_name] = input_value
- return inputs_dict
-
- def _get_block_fit_inputs_dict(self, block_name):
- """Get the list of fit input variables for the given block."""
+ Args:
+ block_name (str):
+ Name of the block for which to get the specification.
+ variables_attr (str):
+ Name of the attribute that has the variables list. It can be
+ `fit_args`, `produce_args` or `produce_output`.
+ names (dict):
+ Dictionary used to translate the variable names.
+ """
block = self.blocks[block_name]
- fit_inputs = deepcopy(block.fit_args)
- input_names = self.input_names.get(block_name, dict())
- fit_inputs_dict = {}
- for fit_input in fit_inputs:
- name = fit_input['name']
- context_name = input_names.get(name, name)
- fit_inputs_dict[context_name] = fit_input
+ variables = deepcopy(getattr(block, variables_attr))
+ variable_dict = {}
+ for variable in variables:
+ name = variable['name']
+ context_name = names.get(name, name)
+ variable_dict[context_name] = variable
- return fit_inputs_dict
+ return variable_dict
def _get_outputs(self, pipeline, outputs):
"""Get the output definitions from the pipeline dictionary.
@@ -264,10 +245,11 @@ def _get_str_output(self, output):
raise ValueError('Invalid Output Specification: {}'.format(output))
def get_inputs(self, fit=True):
- """Get a dictionary mapping all input variable names required by the
- pipeline to a dictionary with their specified information.
+ """Get a relation of all the input variables required by this pipeline.
- Can be specified to include fit arguments.
+ The result is a dictionary that maps each variable name to its
+ specification.
+ Optionally include the fit arguments.
Args:
fit (bool):
@@ -283,15 +265,22 @@ def get_inputs(self, fit=True):
"""
inputs = dict()
for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards
- produce_outputs = self._get_block_outputs_dict(block_name)
+ produce_outputs = self._get_block_variables(block_name,
+ 'produce_output',
+ self.output_names.get(block_name, dict()))
+
for produce_output_name in produce_outputs.keys():
inputs.pop(produce_output_name, None)
- produce_inputs = self._get_block_inputs_dict(block_name)
+ produce_inputs = self._get_block_variables(block_name,
+ 'produce_args',
+ self.input_names.get(block_name, dict()))
inputs.update(produce_inputs)
if fit:
- fit_inputs = self._get_block_fit_inputs_dict(block_name)
+ fit_inputs = self._get_block_variables(block_name,
+ 'fit_args',
+ self.input_names.get(block_name, dict()))
inputs.update(fit_inputs)
return inputs
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 88cb8c44..4fb779b8 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -549,6 +549,29 @@ def test_get_output_variables(self):
assert names == ['a_variable']
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test__get_block_variables(self):
+ expected = {
+ 'name_output': {
+ 'name': 'output',
+ 'type': 'whatever',
+ }
+ }
+
+ pipeline = MLPipeline(['a_primitive'])
+
+ pipeline.blocks['a_primitive#1'].produce_outputs = [
+ {
+ 'name': 'output',
+ 'type': 'whatever'
+ }
+ ]
+
+ outputs = pipeline._get_block_variables('a_primitive#1',
+ 'produce_outputs',
+ {'output': 'name_output'})
+ assert outputs == expected
+
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_inputs_fit(self):
expected = {
From 1dd0f372111a775a1d27b2c77641f7fa884a552f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Tue, 14 Jan 2020 10:19:50 -0500
Subject: [PATCH 082/160] Change indenting
---
AUTHORS.rst | 1 +
mlblocks/mlpipeline.py | 32 ++++++++++++++++++++------------
tests/test_mlpipeline.py | 8 +++++---
3 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/AUTHORS.rst b/AUTHORS.rst
index eb8885c9..7245c735 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -10,3 +10,4 @@ Contributors
* William Xue
* Akshay Ravikumar
* Laura Gustafson
+* Erica Chiu
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index fbd5bcf0..35273642 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -144,9 +144,11 @@ def _get_pipeline_dict(pipeline, primitives):
def _get_block_outputs(self, block_name):
"""Get the list of output variables for the given block."""
- outputs = self._get_block_variables(block_name,
- 'produce_output',
- self.output_names.get(block_name, dict()))
+ outputs = self._get_block_variables(
+ block_name,
+ 'produce_output',
+ self.output_names.get(block_name, dict())
+ )
for context_name, output in outputs.items():
output['variable'] = '{}.{}'.format(block_name, context_name)
@@ -265,22 +267,28 @@ def get_inputs(self, fit=True):
"""
inputs = dict()
for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards
- produce_outputs = self._get_block_variables(block_name,
- 'produce_output',
- self.output_names.get(block_name, dict()))
+ produce_outputs = self._get_block_variables(
+ block_name,
+ 'produce_output',
+ self.output_names.get(block_name, dict())
+ )
for produce_output_name in produce_outputs.keys():
inputs.pop(produce_output_name, None)
- produce_inputs = self._get_block_variables(block_name,
- 'produce_args',
- self.input_names.get(block_name, dict()))
+ produce_inputs = self._get_block_variables(
+ block_name,
+ 'produce_args',
+ self.input_names.get(block_name, dict())
+ )
inputs.update(produce_inputs)
if fit:
- fit_inputs = self._get_block_variables(block_name,
- 'fit_args',
- self.input_names.get(block_name, dict()))
+ fit_inputs = self._get_block_variables(
+ block_name,
+ 'fit_args',
+ self.input_names.get(block_name, dict())
+ )
inputs.update(fit_inputs)
return inputs
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 4fb779b8..340a3838 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -567,9 +567,11 @@ def test__get_block_variables(self):
}
]
- outputs = pipeline._get_block_variables('a_primitive#1',
- 'produce_outputs',
- {'output': 'name_output'})
+ outputs = pipeline._get_block_variables(
+ 'a_primitive#1',
+ 'produce_outputs',
+ {'output': 'name_output'}
+ )
assert outputs == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
From 93994e2a0c177fb8bab33f7fe57dd1eaae61a708 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 23 Jan 2020 20:43:15 +0100
Subject: [PATCH 083/160] Add notebook tutorials and examples
---
Makefile | 4 +
...ification.categorical_encoder.xgboost.json | 16 +
.../mlblocks.examples.ClassPrimitive.json | 104 ++
.../mlblocks.examples.function_primitive.json | 86 ++
.../tutorials/1. Using and MLPipeline.ipynb | 633 +++++++++++++
.../2. Finding and Loading a Pipeline.ipynb | 123 +++
.... Setting MLPipeline Hyperparameters.ipynb | 430 +++++++++
.../4. Saving and Loading a Pipeline.ipynb | 181 ++++
examples/tutorials/5. Tuning a Pipeline.ipynb | 463 +++++++++
...or the best pipeline with BTBSession.ipynb | 895 ++++++++++++++++++
setup.py | 7 +
11 files changed, 2942 insertions(+)
create mode 100644 examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
create mode 100644 examples/primitives/mlblocks.examples.ClassPrimitive.json
create mode 100644 examples/primitives/mlblocks.examples.function_primitive.json
create mode 100644 examples/tutorials/1. Using and MLPipeline.ipynb
create mode 100644 examples/tutorials/2. Finding and Loading a Pipeline.ipynb
create mode 100644 examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
create mode 100644 examples/tutorials/4. Saving and Loading a Pipeline.ipynb
create mode 100644 examples/tutorials/5. Tuning a Pipeline.ipynb
create mode 100644 examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb
diff --git a/Makefile b/Makefile
index 6e8dd203..bfc1a5f6 100644
--- a/Makefile
+++ b/Makefile
@@ -72,6 +72,10 @@ clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all
install: clean-build clean-pyc ## install the package to the active Python's site-packages
pip install .
+.PHONY: install-examples
+install-examples: clean-build clean-pyc ## install the package and the examples dependencies
+ pip install .[examples]
+
.PHONY: install-test
install-test: clean-build clean-pyc ## install the package and test dependencies
pip install .[test]
diff --git a/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
new file mode 100644
index 00000000..4dca4002
--- /dev/null
+++ b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
@@ -0,0 +1,16 @@
+{
+ "metadata": {
+ "data_modality": "single_table",
+ "task_type": "classification"
+ },
+ "validation": {
+ "dataset": "census"
+ },
+ "primitives": [
+ "mlprimitives.custom.preprocessing.ClassEncoder",
+ "mlprimitives.custom.feature_extraction.CategoricalEncoder",
+ "sklearn.impute.SimpleImputer",
+ "xgboost.XGBClassifier",
+ "mlprimitives.custom.preprocessing.ClassDecoder"
+ ]
+}
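
Once the ``examples`` extra is installed, the annotation above can be run end to end. A
sketch, assuming ``mlprimitives`` is installed and census-like ``X_train``, ``y_train``
and ``X_test`` tables are loaded:

    from mlblocks import MLPipeline

    # The JSON annotation can be given to MLPipeline directly, either by
    # path or, if it lives on the configured pipelines paths, by name
    pipeline = MLPipeline(
        'examples/pipelines/single_table.classification.categorical_encoder.xgboost.json')

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)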
diff --git a/examples/primitives/mlblocks.examples.ClassPrimitive.json b/examples/primitives/mlblocks.examples.ClassPrimitive.json
new file mode 100644
index 00000000..6c29e51e
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.ClassPrimitive.json
@@ -0,0 +1,104 @@
+{
+ "name": "the_primitive_name",
+ "primitive": "full.python.path.to.AClass",
+ "fit": {
+ "method": "fit",
+ "args": [
+ {
+ "name": "X",
+ "keyword": "optional_name_of_the_fit_method_argument",
+ "description": "each input can be described",
+ "type": "pandas.DataFrame"
+ },
+ {
+ "name": "y",
+ "description": "each input can be described",
+ "default": "default_value_for_this_argument",
+ "type": "pandas.Series"
+ }
+ ]
+ },
+ "produce": {
+ "method": "predict",
+ "args": [
+ {
+ "name": "X",
+ "keyword": "optional_name_of_the_produce_method_argument",
+ "description": "each input can be described",
+ "type": "DataFrame"
+ }
+ ],
+ "output": [
+ {
+ "name": "y",
+ "descrtiption": "each output argument can be described",
+ "type": "Series"
+ }
+ ]
+ },
+ "hyperparameters": {
+ "fixed": {
+ "a_required_hyperparameter": {
+ "descrtiption": "this is a non tunable hyperparameter that needs to be specified by the user because it does not have a default value",
+ "type": "int"
+ },
+ "an_optional_hyperparameter": {
+ "descrtiption": "this is a non tunable hyperparameter that is optional because it has a default value",
+ "type": "int",
+ "default": 1
+ }
+ },
+ "tunable": {
+ "a_simple_range_hyperparameter": {
+ "description": "hyperparameter documentation can be put here",
+ "default": 1,
+ "type": "int",
+ "range": [1, 10]
+ },
+ "a_categorical_hyperparameter_of_type_int": {
+ "description": "Note that it has the field `values` instead of `range`",
+ "default": 1,
+ "type": "int",
+ "values": [1, 3, 7, 10]
+ },
+ "a_categorical_hyperparameter_of_type_str": {
+ "default": "a",
+ "type": "str",
+ "values": ["a", "b", "c"]
+ },
+ "a_multi_type_hyperprameter": {
+ "description": "this is a hyperparameter that allows more than one type",
+ "type": "multitype",
+ "default": "auto",
+ "types": {
+ "int": {
+ "description": "documentation can also be included here",
+ "range": [1, 10]
+ },
+ "string": {
+ "values": ["some", "string", "values"]
+ }
+ }
+ },
+ "conditional_hyperparameter": {
+ "description": "this is a hyperparameter whose valid values depend on the value of another hyperpameter",
+ "type": "conditional",
+ "condition": "the_name_of_the_other_hyperparameter",
+ "values": {
+ "a": {
+ "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+ "type": "int",
+ "default": 0,
+ "range": [0, 10]
+ },
+ "*": {
+ "description": "this will be used only if the value does not match any other definition",
+ "type": "float",
+ "default": 0.0,
+ "range": [0.0, 1.0]
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/examples/primitives/mlblocks.examples.function_primitive.json b/examples/primitives/mlblocks.examples.function_primitive.json
new file mode 100644
index 00000000..f3627bd9
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.function_primitive.json
@@ -0,0 +1,86 @@
+{
+ "name": "the_primitive_name",
+ "primitive": "full.python.path.to.a_function",
+ "produce": {
+ "args": [
+ {
+ "name": "X",
+ "keyword": "optional_name_of_the_produce_method_argument",
+ "description": "each input can be described",
+ "type": "DataFrame"
+ }
+ ],
+ "output": [
+ {
+ "descrtiption": "each output argument can be described",
+ "name": "y",
+ "type": "Series"
+ }
+ ]
+ },
+ "hyperparameters": {
+ "fixed": {
+ "a_required_hyperparameter": {
+ "descrtiption": "this is a non tunable hyperparameter that needs to be specified by the user, because it does not have a default value",
+ "type": "int"
+ },
+ "an_optional_hyperparameter": {
+ "descrtiption": "this is a non tunable hyperparameter that is optional, because it has a default value",
+ "type": "int",
+ "default": 1
+ }
+ },
+ "tunable": {
+ "a_simple_range_hyperparameter": {
+ "description": "hyperparameter documentation can be put here",
+ "default": 1,
+ "type": "int",
+ "range": [1, 10]
+ },
+ "a_categorical_hyperparameter_of_type_int": {
+ "description": "Note that it has the filed `values` instead of `range`",
+ "default": 1,
+ "type": "int",
+ "values": [1, 3, 7, 10]
+ },
+ "a_categorical_hyperparameter_of_type_str": {
+ "default": "a",
+ "type": "str",
+ "values": ["a", "b", "c"]
+ },
+ "a_multi_type_hyperprameter": {
+ "description": "this is a hyperparameter that allows more than one type",
+ "type": "multitype",
+ "default": "auto",
+ "types": {
+ "int": {
+ "description": "documentation can also be included here",
+ "range": [1, 10]
+ },
+ "string": {
+ "values": ["some", "string", "values"]
+ }
+ }
+ },
+ "conditional_hyperparameter": {
+ "description": "this is a hyperparameter whose valid values depend on the value of another hyperpameter",
+ "type": "conditional",
+ "condition": "the_name_of_the_other_hyperparameter",
+ "values": {
+ "a": {
+ "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+ "type": "int",
+ "default": 0,
+ "range": [0, 10]
+ },
+ "*": {
+ "description": "this will be used only if the value does not match any other definition",
+ "type": "float",
+ "default": 0.0,
+ "range": [0.0, 1.0]
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb
new file mode 100644
index 00000000..733fb42d
--- /dev/null
+++ b/examples/tutorials/1. Using and MLPipeline.ipynb
@@ -0,0 +1,633 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Using an MLPipeline\n",
+ "\n",
+ "In this short guide we will go over the basic MLPipeline functionality.\n",
+ "\n",
+ "We will:\n",
+ "\n",
+ "1. Load a demo dataset.\n",
+ "2. Build a pipeline.\n",
+ "3. Explore the pipeline primitives, inputs and outputs.\n",
+ "4. Fit the pipeline to the dataset.\n",
+ "5. Make predictions using the fitted pipeline.\n",
+ "6. Evaluate the pipeline performance."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load the Dataset\n",
+ "\n",
+ "The first step will be to load the Census dataset using the function provided by mlprimitives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This version of the Census dataset is prepared as a Classification (Supervised) Problem,\n",
+ "and has an input matrix `X` and an expected outcome `y` array."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Adult Census dataset.\n",
+ "\n",
+ " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n",
+ "\n",
+ " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n",
+ " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n",
+ " (AFNLWGT>1)&& (HRSWK>0))\n",
+ "\n",
+ " Prediction task is to determine whether a person makes over 50K a year.\n",
+ "\n",
+ " source: \"UCI\n",
+ " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"\n",
+ " \n",
+ "Data Modality: single_table\n",
+ "Task Type: classification\n",
+ "Task Subtype: binary\n",
+ "Data shape: (32561, 14)\n",
+ "Target shape: (32561,)\n",
+ "Metric: accuracy_score\n",
+ "Extras: \n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The data from the dataset can explored by looking at its `.data` and `.target` attributes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " workclass | \n",
+ " fnlwgt | \n",
+ " education | \n",
+ " education-num | \n",
+ " marital-status | \n",
+ " occupation | \n",
+ " relationship | \n",
+ " race | \n",
+ " sex | \n",
+ " capital-gain | \n",
+ " capital-loss | \n",
+ " hours-per-week | \n",
+ " native-country | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 39 | \n",
+ " State-gov | \n",
+ " 77516 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Never-married | \n",
+ " Adm-clerical | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 2174 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 50 | \n",
+ " Self-emp-not-inc | \n",
+ " 83311 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 13 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 38 | \n",
+ " Private | \n",
+ " 215646 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Divorced | \n",
+ " Handlers-cleaners | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 53 | \n",
+ " Private | \n",
+ " 234721 | \n",
+ " 11th | \n",
+ " 7 | \n",
+ " Married-civ-spouse | \n",
+ " Handlers-cleaners | \n",
+ " Husband | \n",
+ " Black | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 28 | \n",
+ " Private | \n",
+ " 338409 | \n",
+ " Bachelors | \n",
+ " 13 | \n",
+ " Married-civ-spouse | \n",
+ " Prof-specialty | \n",
+ " Wife | \n",
+ " Black | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " Cuba | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age workclass fnlwgt education education-num \\\n",
+ "0 39 State-gov 77516 Bachelors 13 \n",
+ "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
+ "2 38 Private 215646 HS-grad 9 \n",
+ "3 53 Private 234721 11th 7 \n",
+ "4 28 Private 338409 Bachelors 13 \n",
+ "\n",
+ " marital-status occupation relationship race sex \\\n",
+ "0 Never-married Adm-clerical Not-in-family White Male \n",
+ "1 Married-civ-spouse Exec-managerial Husband White Male \n",
+ "2 Divorced Handlers-cleaners Not-in-family White Male \n",
+ "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
+ "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
+ "\n",
+ " capital-gain capital-loss hours-per-week native-country \n",
+ "0 2174 0 40 United-States \n",
+ "1 0 0 13 United-States \n",
+ "2 0 0 40 United-States \n",
+ "3 0 0 40 United-States \n",
+ "4 0 0 40 Cuba "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.target[0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The dataset data can also be splitted in multipe parts for cross validation using the `dataset.get_splits` method.\n",
+ "\n",
+ "For this demo we will be making only one split, which is equivalent to a simple train/test holdout partitioning."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(24420, 14)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(8141, 14)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_test.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build a pipeline\n",
+ "\n",
+ "Once we have the dataset we will build a pipeline that works with it.\n",
+ "\n",
+ "In this case, we will be creating a short pipeline that uses the following primitives:\n",
+ "\n",
+ "- `ClassEncoder` from `mlprimitives`, which encodes the target variable `y` as integers.\n",
+ "- `CategoricaEncoder` from `mlprimitives`, which encodes all the categorical variables from the feature matrix `X`\n",
+ " using one-hot encoding.\n",
+ "- `SimpleImputer` from `sklearn`, which imputes any null values that may exist in the feature matrix `X`\n",
+ "- `XGBClassifier` from `xgboost`, which learns to predict the target variable `y` sing the feature matrix `X`.\n",
+ "- `ClassDecoder` from `mlprimitives`, which reverts the `ClassEncoder` transformation to return the original\n",
+ " target labels instead of integers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "primitives = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+ "]\n",
+ "pipeline = MLPipeline(primitives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the Pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Primitives"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can see the primitives included in this pipeline by having a look at its `primitives` attribute."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder']"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.primitives"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inputs\n",
+ "\n",
+ "We can also see the inputs of the pipeline using the `get_inputs` method.\n",
+ "\n",
+ "This will traverse the pipeline execution graph and show all the variables that need to be\n",
+ "provided by the user in order to fit this pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'X': {'name': 'X', 'type': 'DataFrame'},\n",
+ " 'y': {'name': 'y', 'type': 'ndarray'}}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_inputs()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, we can pass the `fit=False` argument, which will give us the variables needed\n",
+ "in order to make predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'X': {'name': 'X', 'type': 'DataFrame'},\n",
+ " 'y': {'name': 'y', 'default': None, 'type': 'ndarray'}}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_inputs(fit=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note how the `fit` method expects two variables `X` and `y`, while the `predict`\n",
+ "method only needs `X`, as the `y` variable has a default value of `None`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Outputs\n",
+ "\n",
+ "Equally, we can see the outputs that the pipeline will return when used to make predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'name': 'y',\n",
+ " 'type': 'ndarray',\n",
+ " 'variable': 'mlprimitives.custom.preprocessing.ClassDecoder#1.y'}]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_outputs()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fit the Pipeline to the Dataset\n",
+ "\n",
+ "Now that the pipeline is ready and we know its inputs and outputs, we can fit it to the\n",
+ "dataset by passing the training `X` and `y` variables to its `fit` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make Predictions\n",
+ "\n",
+ "After the pipelines finished fitting, we can try to predict the `y_test` array values by\n",
+ "passing the `X_test` matrix to the `pipeline.predict` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "predictions = pipeline.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predictions[0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluating the pipeline performance\n",
+ "\n",
+ "Now we can compare the predicted array with the actual test array to see how well\n",
+ "our pipeline performed.\n",
+ "\n",
+ "This can be done using the `dataset.score` method, which provides a suitable scoring\n",
+ "function for this kind of data and problem.\n",
+ "In this case, the dataset is just computing the accuracy score."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8602137329566393"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.score(y_test, predictions)"
+ ]
+ },
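+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, `dataset.score` is computing the accuracy score here, so the same value can be obtained directly with scikit-learn. The following cell is a minimal sanity check, assuming `sklearn` is available (it already is, since the pipeline itself uses it)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# This should match the dataset.score result above\n",
+ "accuracy_score(y_test, predictions)"
+ ]
+ }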
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
new file mode 100644
index 00000000..a94c48bc
--- /dev/null
+++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
@@ -0,0 +1,123 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Finding and Loading a Pipeline\n",
+ "\n",
+ "In this short tutorial we will show you how to search for pipelines suitable to solve\n",
+ "your prediction problem."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In order to find a suitable pipeline, the first thing we need is to identify\n",
+ "the type of problem (data modality + task type) that we are facing.\n",
+ "\n",
+ "This is a full list of current data modalities and task types that we cover:\n",
+ "\n",
+ "| Problem Type | Data Modality | Task Type |\n",
+ "|:-------------------------------------|:--------------|:------------------------|\n",
+ "| Single Table Classification | single_table | classification |\n",
+ "| Single Table Regression | single_table | regression |\n",
+ "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n",
+ "| Multi Table Classification | multi_table | classification |\n",
+ "| Multi Table Regression | multi_table | regression |\n",
+ "| Time Series Classification | timeseries | classification |\n",
+ "| Time Series Regression | timeseries | regression |\n",
+ "| Time Series Forecasting | timeseries | forecasting |\n",
+ "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n",
+ "| Image Classification | image | classification |\n",
+ "| Image Regression | image | regression |\n",
+ "| Graph Link Prediction | graph | link_prediction |\n",
+ "| Graph Vertex Nomination | graph | vertex_nomination |\n",
+ "| Graph Community Detection | graph | community_detection |\n",
+ "| Graph Matching | graph | graph_matching |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once we have identified our data modality and task type we can use the\n",
+ "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n",
+ "that support this particular problem type.\n",
+ "\n",
+ "For example, if we are looking for a pipeline to work on Image Classification\n",
+ "we will do the following query."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['image.classification.hog.random_forest',\n",
+ " 'image.classification.hog.xgboost',\n",
+ " 'image.classification.resnet50.xgboost']"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from mlblocks.discovery import find_pipelines\n",
+ "\n",
+ "filters = {\n",
+ " 'metadata.data_modality': 'image',\n",
+ " 'metadata.task_type': 'classification',\n",
+ "}\n",
+ "\n",
+ "find_pipelines(filters=filters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n",
+ "by passing its name to the `MLPipeline`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "pipeline = MLPipeline('image.classification.resnet50.xgboost')"
+ ]
+ },
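+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once loaded, the pipeline can be inspected like any other `MLPipeline`. As a minimal check, assuming the pipeline and its primitives resolved correctly, we can list the primitives it will run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The list of primitive names, in execution order\n",
+ "pipeline.primitives"
+ ]
+ }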
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
new file mode 100644
index 00000000..29f60a8f
--- /dev/null
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -0,0 +1,430 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Setting MLPipeline Hyperparameters\n",
+ "\n",
+ "In this short guide we will see how to modify the hyperparameters\n",
+ "of an MLPipeline in order to modify its behavior or performance.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "We will:\n",
+ "\n",
+ "1. Load a dataset and a Pipeline.\n",
+ "2. Explore the pipeline hyperparamters.\n",
+ "3. Reload the pipeline with different hyperparameters.\n",
+ "4. Evaluate the pipeline performance on the dataset.\n",
+ "5. Set different pipeline hyperparameters.\n",
+ "6. Re-evaluate the pipeline performance on the dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load the Dataset and the Pipeline\n",
+ "\n",
+ "The first step will be to load the dataset and the pipeline that we will be using."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')\n",
+ "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the Pipeline Hyperparameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n",
+ "calling its `get_hyperparameters` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n",
+ " 'copy': True,\n",
+ " 'features': 'auto',\n",
+ " 'max_unique_ratio': 0,\n",
+ " 'max_labels': 0},\n",
+ " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n",
+ " 'fill_value': None,\n",
+ " 'verbose': False,\n",
+ " 'copy': True,\n",
+ " 'strategy': 'mean'},\n",
+ " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n",
+ " 'n_estimators': 100,\n",
+ " 'max_depth': 3,\n",
+ " 'learning_rate': 0.1,\n",
+ " 'gamma': 0,\n",
+ " 'min_child_weight': 1},\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_hyperparameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This will return us a dictionary that contains one entry for each step in the pipeline.\n",
+ "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n",
+ "\n",
+ "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n",
+ "\n",
+ "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n",
+ "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'keep'): False,\n",
+ " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'features'): 'auto',\n",
+ " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_unique_ratio'): 0,\n",
+ " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_hyperparameters(flat=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This will return us the same information as before, but organized a single one-level\n",
+ "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter."
+ ]
+ },
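+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal illustration of the flat format, an individual value can be looked up using the `(step name, hyperparameter name)` tuple as the key:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Look up a single hyperparameter value by its (step, hyperparameter) key\n",
+ "pipeline.get_hyperparameters(flat=True)[('xgboost.XGBClassifier#1', 'max_depth')]"
+ ]
+ },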
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setting Pipeline hyperparameter values\n",
+ "\n",
+ "We can set some different hyperparameter values when loading the pipeline by adding the\n",
+ "`init_params` argument to `MLPipeline`.\n",
+ "\n",
+ "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n",
+ "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n",
+ "want to use on that step.\n",
+ "\n",
+ "As an example, we will set a different imputer strategy and a different xgboost max dempt."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "init_params = {\n",
+ " 'sklearn.impute.SimpleImputer#1': {\n",
+ " 'strategy': 'median'\n",
+ " },\n",
+ " 'xgboost.XGBClassifier#1': {\n",
+ " 'max_depth': 4\n",
+ " }\n",
+ "}\n",
+ "pipeline = MLPipeline(\n",
+ " 'single_table.classification.categorical_encoder.xgboost',\n",
+ " init_params=init_params\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can now see how the hyperparameters are different than before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n",
+ " 'copy': True,\n",
+ " 'features': 'auto',\n",
+ " 'max_unique_ratio': 0,\n",
+ " 'max_labels': 0},\n",
+ " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n",
+ " 'fill_value': None,\n",
+ " 'verbose': False,\n",
+ " 'copy': True,\n",
+ " 'strategy': 'median'},\n",
+ " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n",
+ " 'max_depth': 4,\n",
+ " 'n_estimators': 100,\n",
+ " 'learning_rate': 0.1,\n",
+ " 'gamma': 0,\n",
+ " 'min_child_weight': 1},\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_hyperparameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluate the Pipeline performance\n",
+ "\n",
+ "We can now evaluate the pipeline performance to see what results these\n",
+ "hyperparameters produce."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8647586291610367"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.fit(X_train, y_train)\n",
+ "y_pred = pipeline.predict(X_test)\n",
+ "\n",
+ "dataset.score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setting hyperparameter values\n",
+ "\n",
+ "Another way of setting the pipeline hyperparameters without having to recreate it\n",
+ "from scratch, is to use its `set_hyperparameters` method.\n",
+ "\n",
+ "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparameters = {\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n",
+ " 'max_labels': 10\n",
+ " },\n",
+ " 'xgboost.XGBClassifier#1': {\n",
+ " 'learning_rate': 0.3\n",
+ " }\n",
+ "}\n",
+ "pipeline.set_hyperparameters(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, the hyperparameters can be set using the `flat` format:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparameters = {\n",
+ " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n",
+ "}\n",
+ "pipeline.set_hyperparameters(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And we can see how these hyperparameters now are different than before:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n",
+ " 'copy': True,\n",
+ " 'features': 'auto',\n",
+ " 'max_unique_ratio': 0,\n",
+ " 'max_labels': 10},\n",
+ " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n",
+ " 'fill_value': None,\n",
+ " 'verbose': False,\n",
+ " 'copy': True,\n",
+ " 'strategy': 'median'},\n",
+ " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n",
+ " 'max_depth': 4,\n",
+ " 'n_estimators': 100,\n",
+ " 'learning_rate': 0.3,\n",
+ " 'gamma': 0,\n",
+ " 'min_child_weight': 1},\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.get_hyperparameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluate the Pipeline performance\n",
+ "\n",
+ "We can now evaluate again the pipeline performance and see how the hyperparameter\n",
+ "change affected the pipeline performance."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.870531875690947"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.fit(X_train, y_train)\n",
+ "y_pred = pipeline.predict(X_test)\n",
+ "\n",
+ "dataset.score(y_test, y_pred)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
new file mode 100644
index 00000000..193daaf3
--- /dev/null
+++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Saving and Loading a Pipeline\n",
+ "\n",
+ "This short guide shows how serialize a Pipeline into a file and later on load it\n",
+ "to make predictions.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "We will:\n",
+ "\n",
+ "1. Load and fit a pipeline to a dataset\n",
+ "2. Save the pipeline to a file.\n",
+ "3. Load the pipeline as a new object.\n",
+ "4. Make predictions using the new pipeline object."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fit the pipeline\n",
+ "\n",
+ "The first step will be to load and fit the pipeline to the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Save the Pipeline\n",
+ "\n",
+ "Once the pipeline is fit and ready to make predictions we can store it in a file.\n",
+ "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pickle\n",
+ "\n",
+ "with open('pipeline.pkl', 'wb') as f:\n",
+ " pickle.dump(pipeline, f)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load the Pipeline\n",
+ "\n",
+ "The saved pipeline can then be moved to another system where we can load it back to\n",
+ "memory using pickle again."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('pipeline.pkl', 'rb') as f:\n",
+ " loaded_pipeline = pickle.load(f)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**IMPORTANT**: All the dependencies need to also be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make Predictions\n",
+ "\n",
+ "Once the pipeline is loaded it is ready to make predictions again"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pred = pipeline.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pred[0:5]"
+ ]
+ },
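+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick consistency check, a minimal sketch assuming `numpy` is available, we can verify that the loaded pipeline produces exactly the same predictions as the original object:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Both the original and the loaded pipeline should predict identical labels\n",
+ "np.array_equal(pipeline.predict(X_test), loaded_pipeline.predict(X_test))"
+ ]
+ }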
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/5. Tuning a Pipeline.ipynb
new file mode 100644
index 00000000..8dbc4366
--- /dev/null
+++ b/examples/tutorials/5. Tuning a Pipeline.ipynb
@@ -0,0 +1,463 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Tuning a Pipeline\n",
+ "\n",
+ "This short guide shows how tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "Here we will:\n",
+ "1. Load a dataset and a pipeline\n",
+ "2. Explore the pipeline tunable hyperparameters\n",
+ "3. Write a scoring function\n",
+ "4. Build a BTB Tunable and BTB Tuner.\n",
+ "5. Write a tuning loop"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load dataset and the pipeline\n",
+ "\n",
+ "The first step will be to load the dataset that we were using in previous tutorials."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And load a suitable pipeline.\n",
+ "\n",
+ "Note how in this case we are using the variable name `template` instead of `pipeline`,\n",
+ "because this will only be used as a template for the pipelines that we will create\n",
+ "and evaluate during the later tuning loop."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the pipeline tunable hyperparameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n",
+ "by calling the `get_tunable_hyperparameters` method.\n",
+ "\n",
+ "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n",
+ "that is compatible with BTB."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n",
+ " 'default': 'mean',\n",
+ " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n",
+ " 'default': 100,\n",
+ " 'range': [10, 1000]},\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n",
+ " 'default': 3,\n",
+ " 'range': [3, 10]},\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n",
+ " 'default': 0.1,\n",
+ " 'range': [0, 1]},\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n",
+ " 'default': 0,\n",
+ " 'range': [0, 1]},\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n",
+ " 'default': 1,\n",
+ " 'range': [1, 10]}}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tunable_hyperparameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Write a scoring function\n",
+ "\n",
+ "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n",
+ "\n",
+ "For this reason, we will start by writing a scoring function that will expect only one\n",
+ "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n",
+ "\n",
+ "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n",
+ "method from the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "def cross_validate(hyperparameters=None):\n",
+ " scores = []\n",
+ " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n",
+ " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n",
+ " if hyperparameters:\n",
+ " pipeline.set_hyperparameters(hyperparameters)\n",
+ "\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " y_pred = pipeline.predict(X_test)\n",
+ " \n",
+ " scores.append(dataset.score(y_test, y_pred))\n",
+ " \n",
+ " return np.mean(scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "By calling this function without any arguments we will obtain the score obtained\n",
+ "with the default hyperparameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8639171383183359"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "default_score = cross_validate()\n",
+ "default_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, we can certify that by passing a hyperparameters dictionary the new hyperparameters\n",
+ "will be used, resulting on a different score."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8686773872402614"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hyperparameters = {\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n",
+ "}\n",
+ "cross_validate(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a BTB Tunable\n",
+ "\n",
+ "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n",
+ "\n",
+ "For this we will use its `from_dict` method, passing our hyperparameters dict."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from btb.tuning import Tunable\n",
+ "\n",
+ "tunable = Tunable.from_dict(tunable_hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create the BTB Tuner\n",
+ "\n",
+ "After creating the Tunable, we need to create a Tuner to tune it.\n",
+ "\n",
+ "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n",
+ "for the optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from btb.tuning import GPTuner\n",
+ "\n",
+ "tuner = GPTuner(tunable)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, since we already know the score obtained by the default arguments and\n",
+ "these have a high probability of being already decent, we will inform the tuner\n",
+ "about their performance.\n",
+ "\n",
+ "In order to obtain the default hyperparameters used before we can either call\n",
+ "the template `get_hyperparameters(flat=True)` method, the `tunable.get_defaults()`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "defaults = tunable.get_defaults()\n",
+ "defaults"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tuner.record(defaults, default_score)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Start the Tuning loop\n",
+ "\n",
+ "Once we have the tuner ready we can the tuning loop.\n",
+ "\n",
+ "During this loop we will:\n",
+ "\n",
+ "1. Ask the tuner for a new hyperparameter proposal\n",
+ "2. Run the `cross_validate` function to evaluate these hyperparameters\n",
+ "3. Record the obtained score back to the tuner.\n",
+ "4. If the obtained score is better than the previous one, store the proposal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "scoring pipeline 1\n",
+ "scoring pipeline 2\n",
+ "New best found: 0.8722706212975673\n",
+ "scoring pipeline 3\n",
+ "scoring pipeline 4\n",
+ "scoring pipeline 5\n",
+ "scoring pipeline 6\n",
+ "scoring pipeline 7\n",
+ "scoring pipeline 8\n",
+ "scoring pipeline 9\n",
+ "scoring pipeline 10\n"
+ ]
+ }
+ ],
+ "source": [
+ "best_score = default_score\n",
+ "best_proposal = defaults\n",
+ "\n",
+ "for iteration in range(10):\n",
+ " print(\"scoring pipeline {}\".format(iteration + 1))\n",
+ " \n",
+ " proposal = tuner.propose()\n",
+ " score = cross_validate(proposal)\n",
+ " \n",
+ " tuner.record(proposal, score)\n",
+ " \n",
+ " if score > best_score:\n",
+ " print(\"New best found: {}\".format(score))\n",
+ " best_score = score\n",
+ " best_proposal = proposal"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n",
+ "which can be used to generate a new pipeline instance."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 40,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "best_proposal"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_pipeline = MLPipeline(template.to_dict())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_pipeline.set_hyperparameters(best_proposal)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_pipeline.fit(dataset.data, dataset.target)"
+ ]
+ },
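+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, the tuned and fitted pipeline can be serialized for later use, exactly as shown in the previous tutorial. This is a minimal sketch using `pickle`; the file name is arbitrary:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pickle\n",
+ "\n",
+ "# Store the fitted pipeline so it can be loaded again later\n",
+ "with open('best_pipeline.pkl', 'wb') as f:\n",
+ " pickle.dump(best_pipeline, f)"
+ ]
+ }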
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb
new file mode 100644
index 00000000..a1f0c0f4
--- /dev/null
+++ b/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb
@@ -0,0 +1,895 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Selecting and Tuning pipelines\n",
+ "\n",
+ "This guide shows you how to search for multiple pipelines for your problem\n",
+ "and later on use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
+ "to select and tune the best one.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "Here we will:\n",
+ "\n",
+ "1. Load a dataset\n",
+ "2. Search and load suitable templates\n",
+ "3. Write a scoring function\n",
+ "4. Build a BTBSession for our templates\n",
+ "5. Run the session to find the best pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load the Dataset\n",
+ "\n",
+ "The first step will be to load the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Adult Census dataset.\n",
+ "\n",
+ " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n",
+ "\n",
+ " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n",
+ " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n",
+ " (AFNLWGT>1)&& (HRSWK>0))\n",
+ "\n",
+ " Prediction task is to determine whether a person makes over 50K a year.\n",
+ "\n",
+ " source: \"UCI\n",
+ " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"\n",
+ " \n",
+ "Data Modality: single_table\n",
+ "Task Type: classification\n",
+ "Task Subtype: binary\n",
+ "Data shape: (32561, 14)\n",
+ "Target shape: (32561,)\n",
+ "Metric: accuracy_score\n",
+ "Extras: \n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Find and load suitable Templates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will be using the `mlblocks.discovery.find_pipelines` function to search\n",
+ "for compatible pipelines.\n",
+ "\n",
+ "In this case, we will be looking for `single_table/classification` pipelines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks.discovery import find_pipelines\n",
+ "\n",
+ "filters = {\n",
+ " 'metadata.data_modality': 'single_table',\n",
+ " 'metadata.task_type': 'classification'\n",
+ "}\n",
+ "templates = find_pipelines(filters=filters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['single_table.classification.categorical_encoder.logit',\n",
+ " 'single_table.classification.categorical_encoder.random_forest',\n",
+ " 'single_table.classification.categorical_encoder.xgboost',\n",
+ " 'single_table.classification.mlprimitives.logit',\n",
+ " 'single_table.classification.mlprimitives.random_forest',\n",
+ " 'single_table.classification.mlprimitives.xgboost',\n",
+ " 'single_table.classification.mlprimitives_text.xgboost']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "templates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And we will create a dictionary with MLPipeline instances that will be used as tempaltes for our tuning."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "templates_dict = {\n",
+ " template: MLPipeline(template)\n",
+ " for template in templates\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "templates_dict['single_table.classification.mlprimitives.xgboost']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a scoring function\n",
+ "\n",
+ "In order to use a `BTBSession` we will need a function that is able to score a proposal,\n",
+ "which will always be a pair of template name and proposed hyperparameters.\n",
+ "\n",
+ "In this case, the evaluation will be done using 5-fold cross validation over our dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "def cross_validate(template_name, hyperparameters=None):\n",
+ " template = templates_dict[template_name]\n",
+ " scores = []\n",
+ " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n",
+ " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n",
+ " if hyperparameters:\n",
+ " pipeline.set_hyperparameters(hyperparameters)\n",
+ "\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " y_pred = pipeline.predict(X_test)\n",
+ " \n",
+ " scores.append(dataset.score(y_test, y_pred))\n",
+ " \n",
+ " return np.mean(scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup the BTBSession"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will create another dictionary with the tunable hyperparameters of each template.\n",
+ "This will be used by the BTBSession to know how to tune each template."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tunables = {\n",
+ " name: template.get_tunable_hyperparameters(flat=True)\n",
+ " for name, template in templates_dict.items()\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): {'type': 'bool', 'default': True},\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): {'type': 'bool', 'default': True},\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n",
+ " 'default': 'mean',\n",
+ " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n",
+ " 'default': 100,\n",
+ " 'range': [10, 1000]},\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n",
+ " 'default': 3,\n",
+ " 'range': [3, 10]},\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n",
+ " 'default': 0.1,\n",
+ " 'range': [0, 1]},\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n",
+ " 'default': 0,\n",
+ " 'range': [0, 1]},\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n",
+ " 'default': 1,\n",
+ " 'range': [1, 10]}}"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tunables['single_table.classification.mlprimitives.xgboost']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And then create a `BTBSession` instance passing them and the `cross_validate` function.\n",
+ "\n",
+ "We will also be setting it in `verbose` mode, so we can have a better insight on what is going on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from btb.session import BTBSession\n",
+ "\n",
+ "session = BTBSession(tunables, cross_validate, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Run the session"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After everything is set up, we can start running the tuning session passing it\n",
+ "the number of iterations that we want to perform."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n",
+ "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n",
+ "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n",
+ "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n",
+ "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n",
+ "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n",
+ "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n",
+ " 'name': 'single_table.classification.categorical_encoder.xgboost',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n",
+ " 'score': 0.8639171383183359}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "session.run(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "During this loop, the BTBSession will build pipelines based on our templates and evaluate them\n",
+ "using our scoring function."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Evaluate results\n",
+ "\n",
+ "When the session funishes running it will return a the best proposal available and the\n",
+ "obtained score.\n",
+ "\n",
+ "These results are also available as the `best_proposal` attribute from the btb session object."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n",
+ " 'name': 'single_table.classification.categorical_encoder.xgboost',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n",
+ " 'score': 0.8639171383183359}"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "session.best_proposal"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Continue Running"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If we feel that the score can still be improved and want to keep searching, we can simply run the session again which will continue tuning over the previous results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a76ce44e1173496e99baaf7ee39a3df7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n",
+ "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n",
+ "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n",
+ "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
+ " return self._engine.get_loc(key)\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n",
+ " block_outputs = block.produce(**produce_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n",
+ " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
+ " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n",
+ " texts = X[self.column]\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
+ " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n",
+ "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
+ " return self._engine.get_loc(key)\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
+ " score = self.scorer(tunable_name, config)\n",
+ " File \"\", line 11, in cross_validate\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n",
+ " self._produce_block(block, block_name, context, output_variables, outputs)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n",
+ " block_outputs = block.produce(**produce_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n",
+ " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
+ " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n",
+ " texts = X[self.column]\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
+ " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n",
+ "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
+ "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n",
+ "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
+ "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n",
+ "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
+ " block.fit(**fit_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
+ " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
+ " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
+ " \"got %s penalty.\" % (solver, penalty))\n",
+ "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
+ "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n",
+ "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
+ " score = self.scorer(tunable_name, config)\n",
+ " File \"\", line 11, in cross_validate\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n",
+ " self._fit_block(block, block_name, context)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
+ " block.fit(**fit_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
+ " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
+ " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
+ " \"got %s penalty.\" % (solver, penalty))\n",
+ "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.categorical_encoder.logit\n",
+ "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n",
+ "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
+ "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
+ "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
+ "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
+ "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n",
+ "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
+ " block.fit(**fit_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
+ " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
+ " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
+ " \"got %s penalty.\" % (solver, penalty))\n",
+ "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
+ "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n",
+ "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n",
+ "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
+ " score = self.scorer(tunable_name, config)\n",
+ " File \"\", line 11, in cross_validate\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n",
+ " self._fit_block(block, block_name, context)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
+ " block.fit(**fit_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
+ " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
+ " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
+ " \"got %s penalty.\" % (solver, penalty))\n",
+ "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
+ "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n",
+ "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
+ "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
+ " warn(\"Some inputs do not have OOB scores. \"\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
+ " predictions[k].sum(axis=1)[:, np.newaxis])\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
+ " warn(\"Some inputs do not have OOB scores. \"\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
+ " predictions[k].sum(axis=1)[:, np.newaxis])\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
+ " warn(\"Some inputs do not have OOB scores. \"\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
+ " predictions[k].sum(axis=1)[:, np.newaxis])\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
+ " warn(\"Some inputs do not have OOB scores. \"\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
+ " predictions[k].sum(axis=1)[:, np.newaxis])\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
+ " warn(\"Some inputs do not have OOB scores. \"\n",
+ "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
+ " predictions[k].sum(axis=1)[:, np.newaxis])\n",
+ "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
+ "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
+ "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n",
+ "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
+ "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
+ "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
+ "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
+ "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
+ " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 22,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 3863,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
+ " 'score': 0.8739290413691612}"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "session.run(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**NOTE**: If you look at the logs you will notice how the BTBSession captures the errors that finds\n",
+ "while executing the pipelines and automatically discards the failing tempaltes to be able to continue\n",
+ "the tuning session without wasting time on them.\n",
+ "\n",
+ "The number of errors that we want to wait before discarding a template can be changed passing the\n",
+ "`max_errors` argument to the `BTBSession` when it is build.\n",
+ "\n",
+ "Isn't it cool?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build the best pipeline\n",
+ "\n",
+ "Once we are satisfied with the results, we can then build an instance of the best pipeline\n",
+ "by reading the `best_proposal` attribute from the `session`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
+ " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 22,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 3863,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
+ " 'score': 0.8739290413691612}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "best_proposal = session.best_proposal\n",
+ "best_proposal"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "template = templates_dict[best_proposal['name']]\n",
+ "\n",
+ "pipeline = MLPipeline(template.to_dict())\n",
+ "pipeline.set_hyperparameters(best_proposal['config'])\n",
+ "\n",
+ "pipeline.fit(dataset.data, dataset.target)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore other results\n",
+ "\n",
+ "Optionally, if we are interested in exploring the results of the previous proposals we can access them\n",
+ "in the `trials` attribute of the `session` object."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n",
+ " 'name': 'single_table.classification.categorical_encoder.logit',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n",
+ " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n",
+ " 'score': 0.7975185708718643},\n",
+ " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n",
+ " 'name': 'single_table.classification.categorical_encoder.random_forest',\n",
+ " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+ " 'max_labels'): 0,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1',\n",
+ " 'min_weight_fraction_leaf'): 0.0,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n",
+ " ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n",
+ " 'score': 0.7591904454179904}]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list(session.proposals.values())[0:2]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/setup.py b/setup.py
index 1e8ef2ad..6045c574 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,12 @@
]
+examples_require = [
+ 'mlprimitives>=0.2.4.dev0',
+ 'jupyter==1.0.0'
+]
+
+
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
@@ -88,6 +94,7 @@
extras_require={
'dev': development_requires + tests_require,
'test': tests_require,
+ 'examples': examples_require,
},
include_package_data=True,
install_requires=install_requires,
From c2f862b55ec52e6b7c431fe741bd83f7366b6a09 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 15:20:27 -0500
Subject: [PATCH 084/160] Update tutorials
---
.gitignore | 1 +
examples/README.md | 57 ++
.../2. Finding and Loading a Pipeline.ipynb | 6 +-
.... Setting MLPipeline Hyperparameters.ipynb | 9 +-
...Saving and Loading a Pipeline-Copy1.ipynb} | 9 +-
...ial execution and pipeline debugging.ipynb | 712 ++++++++++++++++++
.../6. Flexible outputs specification.ipynb | 517 +++++++++++++
...eline.ipynb => 7. Tuning a Pipeline.ipynb} | 0
...r the best pipeline with BTBSession.ipynb} | 0
9 files changed, 1306 insertions(+), 5 deletions(-)
create mode 100644 examples/README.md
rename examples/tutorials/{4. Saving and Loading a Pipeline.ipynb => 4. Saving and Loading a Pipeline-Copy1.ipynb} (91%)
create mode 100644 examples/tutorials/5. Partial execution and pipeline debugging.ipynb
create mode 100644 examples/tutorials/6. Flexible outputs specification.ipynb
rename examples/tutorials/{5. Tuning a Pipeline.ipynb => 7. Tuning a Pipeline.ipynb} (100%)
rename examples/tutorials/{6. Searching for the best pipeline with BTBSession.ipynb => 8. Searching for the best pipeline with BTBSession.ipynb} (100%)
diff --git a/.gitignore b/.gitignore
index 011ff452..037d677e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,4 @@ ENV/
.*.swp
mlblocks/data
+examples/tutorials/pipeline.pkl
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..12131c95
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,57 @@
+# MLBlocks Examples
+
+This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks
+functionality.
+
+Within this folder you will find:
+
+* `examples.py`: Simple Python code examples of a class-based and a function-based primitive implementation.
+* `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
+* `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
+* `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
+
+
+# Requirements
+
+In order to run the examples contained in this folder you should have [pip installed on your system
+](https://pip.pypa.io/en/stable/installing/).
+
+Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to
+run them in an isolated environment.
+
+# Usage
+
+In order to run these tutorials on your computer, please follow these steps:
+
+1. Clone this GitHub repository:
+
+```bash
+git clone git@github.com:HDI-Project/MLBlocks.git
+```
+
+2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the
+rest of your computer:
+
+```bash
+pip install virtualenv
+virtualenv -p $(which python3.6) mlblocks-venv
+source mlblocks-venv/bin/activate
+```
+
+3. Enter the repository and install the dependencies:
+
+```bash
+cd MLBlocks
+make install-examples
+```
+
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
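+
+If you prefer not to use `make`, installing the package with its `examples` extras should be
+equivalent (a sketch, assuming the `examples` extra defined in this repository's `setup.py`):
+
+```bash
+pip install -e .[examples]
+```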
+
+4. Enter the `examples` folder and start a Jupyter Notebook:
+
+```bash
+jupyter notebook
+```
+
+5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder.
diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
index a94c48bc..8df76259 100644
--- a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
+++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
@@ -52,7 +52,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -63,7 +63,7 @@
" 'image.classification.resnet50.xgboost']"
]
},
- "execution_count": 8,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -89,7 +89,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 29f60a8f..0914e806 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -51,7 +51,14 @@
"source": [
"from mlblocks import MLPipeline\n",
"\n",
- "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ "primitives = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+ "]\n",
+ "pipeline = MLPipeline(primitives)"
]
},
{
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
similarity index 91%
rename from examples/tutorials/4. Saving and Loading a Pipeline.ipynb
rename to examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
index 193daaf3..f8a0a5b3 100644
--- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
+++ b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
@@ -57,7 +57,14 @@
"source": [
"from mlblocks import MLPipeline\n",
"\n",
- "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ "primitives = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+ "]\n",
+ "pipeline = MLPipeline(primitives)"
]
},
{
diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
new file mode 100644
index 00000000..2e21c85b
--- /dev/null
+++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
@@ -0,0 +1,712 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Partial execution and pipeline debugging\n",
+ "\n",
+ "In this guide we will show you how to execute a pipeline partially in order to\n",
+ "debug its internal behavior or optimize tuning processes.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "We will:\n",
+ "\n",
+ "1. Load a pipeline and a dataset\n",
+ "2. Explore the context after fitting the first primitive.\n",
+ "3. Fit the rest of the pipeline\n",
+ "4. Partial execution during Predict\n",
+ "5. Rerunning the last steps"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load a pipeline and a datset\n",
+ "\n",
+ "The first step will be to load the Census dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a reminder, we have a loot at what the `X` and `y` variables that we will be passing to our\n",
+ "pipeline look like."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`X` is a `pandas.DataFrame` that conatins the demographics data of the subjects:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " workclass | \n",
+ " fnlwgt | \n",
+ " education | \n",
+ " education-num | \n",
+ " marital-status | \n",
+ " occupation | \n",
+ " relationship | \n",
+ " race | \n",
+ " sex | \n",
+ " capital-gain | \n",
+ " capital-loss | \n",
+ " hours-per-week | \n",
+ " native-country | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 28291 | \n",
+ " 25 | \n",
+ " Private | \n",
+ " 193379 | \n",
+ " Assoc-acdm | \n",
+ " 12 | \n",
+ " Never-married | \n",
+ " Craft-repair | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 45 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 28636 | \n",
+ " 55 | \n",
+ " Federal-gov | \n",
+ " 176904 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 7919 | \n",
+ " 30 | \n",
+ " Private | \n",
+ " 284395 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Craft-repair | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 50 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 24861 | \n",
+ " 17 | \n",
+ " Private | \n",
+ " 239346 | \n",
+ " 10th | \n",
+ " 6 | \n",
+ " Never-married | \n",
+ " Other-service | \n",
+ " Own-child | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 18 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 23480 | \n",
+ " 51 | \n",
+ " Private | \n",
+ " 57698 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-spouse-absent | \n",
+ " Other-service | \n",
+ " Unmarried | \n",
+ " White | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age workclass fnlwgt education education-num \\\n",
+ "28291 25 Private 193379 Assoc-acdm 12 \n",
+ "28636 55 Federal-gov 176904 HS-grad 9 \n",
+ "7919 30 Private 284395 HS-grad 9 \n",
+ "24861 17 Private 239346 10th 6 \n",
+ "23480 51 Private 57698 HS-grad 9 \n",
+ "\n",
+ " marital-status occupation relationship race \\\n",
+ "28291 Never-married Craft-repair Not-in-family White \n",
+ "28636 Married-civ-spouse Exec-managerial Husband White \n",
+ "7919 Married-civ-spouse Craft-repair Husband White \n",
+ "24861 Never-married Other-service Own-child White \n",
+ "23480 Married-spouse-absent Other-service Unmarried White \n",
+ "\n",
+ " sex capital-gain capital-loss hours-per-week native-country \n",
+ "28291 Male 0 0 45 United-States \n",
+ "28636 Male 0 0 40 United-States \n",
+ "7919 Male 0 0 50 United-States \n",
+ "24861 Male 0 0 18 United-States \n",
+ "23480 Female 0 0 40 United-States "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+ "above or under 50K."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_train[0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And we build a suitable pipeline for our dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "primitives = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+ "]\n",
+ "pipeline = MLPipeline(primitives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the context after fitting the first primitive\n",
+ "\n",
+ "Once we know what primitives we are executing, we will execute only the first one\n",
+ "and see how the context changed after it.\n",
+ "\n",
+ "For this, we will execute the `fit` method passing the index of the last pipeline\n",
+ "step that we want to execute before returning. In this case, `0`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fit_context = pipeline.fit(X_train, y_train, output_=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**NOTE**: Optionally, instead of passing the pipeline step index, we could pass the complete name\n",
+ "of the step, including the counter number: `mlprimitives.custom.preprocessing.ClassEncoder#1`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_step = 'mlprimitives.custom.preprocessing.ClassEncoder#1'\n",
+ "fit_context = pipeline.fit(X_train, y_train, output_=output_step)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In both cases, the output will be a dictionary containing all the context variables after\n",
+ "fitting and producing the first pipeline step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['X', 'y', 'classes'])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fit_context.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice how we find the `X` and `y` variables that we passed to the `fit` method, but also a new `classes` variable\n",
+ "that was generated by the `mlprimitives.custom.preprocessing.ClassEncoder` primitive of the first pipeline step.\n",
+ "\n",
+ "This `classes` variable contains the list of unique values that the variable `y` originally had."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' <=50K', ' >50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fit_context['classes']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Also notice that the variable `y` has been transformed by the primitive into an array of\n",
+ "integer values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0, 0, 0, 0, 0])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fit_context['y'][0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fit the rest of the pipeline\n",
+ "\n",
+ "After exploring the context generated by the first pipeline step we will now run\n",
+ "a few steps more, up to the point where the feature matrix is ready for the XGBClassifier.\n",
+ "\n",
+ "For this we will run the `fit` method again passing back the context that we just obtained\n",
+ "as well as the `start_` argument indicating that we need to start fitting on the second\n",
+ "step of the pipeline, skipping the first one, and the `output_` argument indicating that\n",
+ "we want to stop on the third step, right before the `XGBClassifier` primitive.\n",
+ "\n",
+ "Note how the context is passed using a double asterisk `**` syntax, but that individual\n",
+ "variables could also be passed as keyword arguments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now the context still contains the same variables as before"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['classes', 'X', 'y'])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fit_context.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "But the variable `X` has been completely modified by the CategoricalEncoder and Imputer\n",
+ "primitives, so now it is a 100% numerical `numpy.ndarray` ready for the `XGBClassifier`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([2.50000e+01, 1.93379e+05, 1.20000e+01, 0.00000e+00, 0.00000e+00,\n",
+ " 4.50000e+01, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n",
+ " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n",
+ " 0.00000e+00, 0.00000e+00, 0.00000e+00])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fit_context['X'][0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we can pass the new context to the rest of the pipeline to finish fitting it.\n",
+ "\n",
+ "Note how, just like the `output_`, the `start_` step can also be indicated using the step\n",
+ "name instead of the index."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.fit(start_='xgboost.XGBClassifier#1', **fit_context)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Partial execution during Predict\n",
+ "\n",
+ "Just like in the `fit` stage, the `predict` method also accepts a partial output specification."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predict_context = pipeline.predict(X_test, output_=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['X', 'y'])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predict_context.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As well as a partial execution after a specific pipeline step"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = pipeline.predict(start_=3, **predict_context)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predictions[0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Rerunning the last steps\n",
+ "\n",
+ "One of the key advantages of the partial execution that we just explored is the\n",
+ "possibility to re-fit and make new predictions multiple times with different\n",
+ "hyperparameter values for the last half of the pipeline without the need to\n",
+ "re-fit and re-execute the first half.\n",
+ "\n",
+ "This has the potential to greatly accelerate tuning processes in cases where there\n",
+ "are no tunable hyperparameters (or there are but we do not want to tune them) in\n",
+ "the preprocessing steps but the execution times are long.\n",
+ "\n",
+ "As an example, let's evaluate the performance of the pipeline and try to optimize\n",
+ "it by changing some hyperparameters of the classifier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8602137329566393"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.score(y_test, predictions)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparameters = {\n",
+ " 'xgboost.XGBClassifier#1': {\n",
+ " 'learning_rate': 0.5\n",
+ " }\n",
+ "}\n",
+ "pipeline.set_hyperparameters(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.fit(start_=3, **fit_context)\n",
+ "predictions = pipeline.predict(start_=3, **predict_context)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.872251566146665"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.score(y_test, predictions)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb
new file mode 100644
index 00000000..3dc3686f
--- /dev/null
+++ b/examples/tutorials/6. Flexible outputs specification.ipynb
@@ -0,0 +1,517 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Flexible outputs specification\n",
+ "\n",
+ "In a previous tutorial we have learnt how to obtain intermediate pipeline\n",
+ "outputs in order to debug its internal behavior.\n",
+ "\n",
+ "In this guide we will go a bit further and learn how to define flexible outputs\n",
+ "for the pipeline in order to obtain the output of multiple primitives\n",
+ "at once.\n",
+ "\n",
+ "Note that some steps are not explained for simplicity. Full details\n",
+ "about them can be found in the previous parts of the tutorial.\n",
+ "\n",
+ "We will:\n",
+ "\n",
+ "1. Load a pipeline and a dataset\n",
+ "2. Explore the output specification formats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load a pipeline and a datset\n",
+ "\n",
+ "The first step will be to load the Census dataset and the pipeline that we will be using."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlprimitives.datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset('census')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlblocks import MLPipeline\n",
+ "\n",
+ "primitives = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+ "]\n",
+ "pipeline = MLPipeline(primitives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Also, just as a reminder, let's have a quick look at the steps of this pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['mlprimitives.custom.preprocessing.ClassEncoder',\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+ " 'sklearn.impute.SimpleImputer',\n",
+ " 'xgboost.XGBClassifier',\n",
+ " 'mlprimitives.custom.preprocessing.ClassDecoder']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipeline.primitives"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And at the `X` and `y` variables that we will be passing to our pipeline."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`X` is a `pandas.DataFrame` that conatins the demographics data of the subjects:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " workclass | \n",
+ " fnlwgt | \n",
+ " education | \n",
+ " education-num | \n",
+ " marital-status | \n",
+ " occupation | \n",
+ " relationship | \n",
+ " race | \n",
+ " sex | \n",
+ " capital-gain | \n",
+ " capital-loss | \n",
+ " hours-per-week | \n",
+ " native-country | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 28291 | \n",
+ " 25 | \n",
+ " Private | \n",
+ " 193379 | \n",
+ " Assoc-acdm | \n",
+ " 12 | \n",
+ " Never-married | \n",
+ " Craft-repair | \n",
+ " Not-in-family | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 45 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 28636 | \n",
+ " 55 | \n",
+ " Federal-gov | \n",
+ " 176904 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Exec-managerial | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 7919 | \n",
+ " 30 | \n",
+ " Private | \n",
+ " 284395 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-civ-spouse | \n",
+ " Craft-repair | \n",
+ " Husband | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 50 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 24861 | \n",
+ " 17 | \n",
+ " Private | \n",
+ " 239346 | \n",
+ " 10th | \n",
+ " 6 | \n",
+ " Never-married | \n",
+ " Other-service | \n",
+ " Own-child | \n",
+ " White | \n",
+ " Male | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 18 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ " | 23480 | \n",
+ " 51 | \n",
+ " Private | \n",
+ " 57698 | \n",
+ " HS-grad | \n",
+ " 9 | \n",
+ " Married-spouse-absent | \n",
+ " Other-service | \n",
+ " Unmarried | \n",
+ " White | \n",
+ " Female | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40 | \n",
+ " United-States | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age workclass fnlwgt education education-num \\\n",
+ "28291 25 Private 193379 Assoc-acdm 12 \n",
+ "28636 55 Federal-gov 176904 HS-grad 9 \n",
+ "7919 30 Private 284395 HS-grad 9 \n",
+ "24861 17 Private 239346 10th 6 \n",
+ "23480 51 Private 57698 HS-grad 9 \n",
+ "\n",
+ " marital-status occupation relationship race \\\n",
+ "28291 Never-married Craft-repair Not-in-family White \n",
+ "28636 Married-civ-spouse Exec-managerial Husband White \n",
+ "7919 Married-civ-spouse Craft-repair Husband White \n",
+ "24861 Never-married Other-service Own-child White \n",
+ "23480 Married-spouse-absent Other-service Unmarried White \n",
+ "\n",
+ " sex capital-gain capital-loss hours-per-week native-country \n",
+ "28291 Male 0 0 45 United-States \n",
+ "28636 Male 0 0 40 United-States \n",
+ "7919 Male 0 0 50 United-States \n",
+ "24861 Male 0 0 18 United-States \n",
+ "23480 Female 0 0 40 United-States "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+ "above or under 50K."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_train[0:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the output specification formats\n",
+ "\n",
+ "In the previous tutorial we learnt that the output of a pipeline can be specified\n",
+ "in multiple formats:\n",
+ "\n",
+ "* An integer indicating the pipeline step index, which will return us the complete\n",
+ " context after producing the corresponding step.\n",
+ "* A string indicating the name of a step, which will also return us the complete\n",
+ " context after producing the corresponding step.\n",
+ " \n",
+ "A part from these two options, there are a few more.\n",
+ "\n",
+ "### Single variable specification\n",
+ "\n",
+ "Variables can be individually specified by passing a string in the format\n",
+ "`{pipeline-step-name}.{variable-name}`.\n",
+ "\n",
+ "Note that the `pipeline-step-name` part is not only the primitive name, but\n",
+ "also the counter number at the end of it.\n",
+ "\n",
+ "For example, if we want to explore the `classes` variable generated by\n",
+ "the `ClassEncoder` primitive during `fit`, we can do the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([' <=50K', ' >50K'], dtype=object)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_spec = 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes'\n",
+ "pipeline.fit(X_train, y_train, output_=output_spec)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**NOTE**: Just like with the full context specification, when a variable is specified\n",
+ "the pipeline will be executed only up to the step that produces the indicated variable."
+ ]
+ },
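+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick illustrative sketch of the two formats recapped above (this cell\n",
+ "was not part of the original tutorial run), an integer index and a step name\n",
+ "are two equivalent ways of requesting the complete context after the first step:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Both calls return the full context right after the ClassEncoder step\n",
+ "context_by_index = pipeline.fit(X_train, y_train, output_=0)\n",
+ "context_by_name = pipeline.fit(\n",
+ "    X_train, y_train,\n",
+ "    output_='mlprimitives.custom.preprocessing.ClassEncoder#1'\n",
+ ")"
+ ]
+ },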
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### List of variables\n",
+ "\n",
+ "In some cases we will be interested in obtaining more than one variable\n",
+ "at a time.\n",
+ "\n",
+ "In order to do this, instead of a single string specification we can pass\n",
+ "a list of strings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_spec = [\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+ "]\n",
+ "out = pipeline.fit(X_train, y_train, output_=output_spec)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The output will be a `tuple` containing the variables in the specified order."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y, classes = out"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If we want to obtain variables from multiple pipeline steps we simply need\n",
+ "to specify all of them at once. Again, **MLBlocks** will run all the necessary\n",
+ "pipeline steps, accumulating the desired variables up to the last step needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_spec = [\n",
+ " 'sklearn.impute.SimpleImputer#1.X',\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+ "]\n",
+ "X, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If required, we can even capture the same variable along the different pipeline steps!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_spec = [\n",
+ " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n",
+ " 'sklearn.impute.SimpleImputer#1.X',\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+ " 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+ "]\n",
+ "X_1, X_2, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(24420, 108)"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_1.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(24420, 108)"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_2.shape"
+ ]
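+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Both captures have the same shape, although the values may differ: `X_2` is\n",
+ "the output of the `SimpleImputer`, so any missing values present in `X_1`\n",
+ "have been imputed. As a quick illustrative check (this cell was not part of\n",
+ "the original tutorial run):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Same shape, even though X_2 may contain imputed values\n",
+ "X_1.shape == X_2.shape"
+ ]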
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
similarity index 100%
rename from examples/tutorials/5. Tuning a Pipeline.ipynb
rename to examples/tutorials/7. Tuning a Pipeline.ipynb
diff --git a/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
similarity index 100%
rename from examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb
rename to examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
From 03c7a2d07d15f6e69e448e72860fc4b18ad60ac9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 17:30:57 -0500
Subject: [PATCH 085/160] Simplify README and make docs based on it
---
README.md | 190 +++++-----------------------
docs/conf.py | 4 +-
docs/getting_started/install.rst | 57 ---------
docs/getting_started/quickstart.rst | 125 ------------------
docs/index.rst | 54 +-------
docs/readme.rst | 1 +
6 files changed, 37 insertions(+), 394 deletions(-)
delete mode 100644 docs/getting_started/install.rst
delete mode 100644 docs/getting_started/quickstart.rst
create mode 100644 docs/readme.rst
diff --git a/README.md b/README.md
index 7c152fa3..f3a6e3d7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+
An open source project from Data to AI Lab at MIT.
@@ -16,12 +16,12 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://codecov.io/gh/HDI-Project/MLBlocks)
[](https://pepy.tech/project/mlblocks)
+# Overview
+
* Free software: MIT license
* Documentation: https://HDI-Project.github.io/MLBlocks
* Homepage: https://github.com/HDI-Project/MLBlocks
-# MLBlocks
-
MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by
seamlessly combining tools from any python library with a simple, common and uniform interface.
@@ -47,25 +47,10 @@ Also, although it is not strictly required, the usage of a
[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
interfering with other software installed in the system where **MLBlocks** is run.
-These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**:
-
-```bash
-pip install virtualenv
-virtualenv -p $(which python3.6) mlblocks-venv
-```
-
-Afterwards, you have to execute this command to have the virtualenv activated:
-
-```bash
-source mlblocks-venv/bin/activate
-```
-
-Remember about executing it every time you start a new console to work on **MLBlocks**!
-
## Install with pip
-After creating the virtualenv and activating it, we recommend using
-[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**:
+The easiest and recommended way to install **MLBlocks** is using [pip](
+https://pip.pypa.io/en/stable/):
```bash
pip install mlblocks
@@ -73,46 +58,8 @@ pip install mlblocks
This will pull and install the latest stable release from [PyPi](https://pypi.org/).
-## Install from source
-
-Alternatively, with your virtualenv activated, you can clone the repository and install it from
-source by running `make install` on the `stable` branch:
-
-```bash
-git clone git@github.com:HDI-Project/MLBlocks.git
-cd MLBlocks
-git checkout stable
-make install
-```
-
-## Install for Development
-
-If you want to contribute to the project, a few more steps are required to make the project ready
-for development.
-
-First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks)
-and make a fork of the project under you own username by clicking on the **fork** button on the
-upper right corner of the page.
-
-Afterwards, clone your fork and create a branch from master with a descriptive name that includes
-the number of the issue that you are going to work on:
-
-```bash
-git clone git@github.com:{your username}/MLBlocks.git
-cd MLBlocks
-git branch issue-xx-cool-new-feature master
-git checkout issue-xx-cool-new-feature
-```
-
-Finally, install the project with the following command, which will install some additional
-dependencies for code linting and testing.
-
-```bash
-make install-develop
-```
-
-Make sure to use them regularly while developing by running the commands `make lint` and `make test`.
-
+If you want to install from source or contribute to the project please read the
+[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started).
## MLPrimitives
@@ -128,118 +75,43 @@ pip install mlprimitives
# Quickstart
-Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it
-using demo data and use it to make predictions.
+Below is a short example of how to use **MLBlocks** to solve a prediction problem
+using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives).
-Please make sure to also having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
-before following it.
+```python3
+from mlblocks import MLPipeline
+from mlprimitives.datasets import load_dataset
-For advance usage and more detailed explanation about each component, please have a look
-at the [documentation](https://HDI-Project.github.io/MLBlocks)
+dataset = load_dataset('census')
+X_train, X_test, y_train, y_test = dataset.get_splits(1)
-## Creating a pipeline
+primitives = [
+ 'mlprimitives.custom.preprocessing.ClassEncoder',
+ 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
+ 'sklearn.impute.SimpleImputer',
+ 'xgboost.XGBClassifier',
+ 'mlprimitives.custom.preprocessing.ClassDecoder'
+]
+pipeline = MLPipeline(primitives)
-With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing
-them to the `MLPipeline` class.
+pipeline.fit(X_train, y_train)
+predictions = pipeline.predict(X_test)
-```python
->>> from mlblocks import MLPipeline
-... primitives = [
-... 'cv2.GaussianBlur',
-... 'skimage.feature.hog',
-... 'sklearn.ensemble.RandomForestClassifier'
-... ]
->>> pipeline = MLPipeline(primitives)
-```
-
-Optionally, specific initialization arguments can be also set by specifying them in a dictionary:
-
-```python
->>> init_params = {
-... 'skimage.feature.hog': {
-... 'multichannel': True,
-... 'visualize': False
-... },
-... 'sklearn.ensemble.RandomForestClassifier': {
-... 'n_estimators': 100,
-... }
-... }
->>> pipeline = MLPipeline(primitives, init_params=init_params)
-```
-
-If you can see which hyperparameters a particular pipeline is using, you can do so by calling
-its `get_hyperparameters` method:
-
-```python
->>> import json
->>> hyperparameters = pipeline.get_hyperparameters()
->>> print(json.dumps(hyperparameters, indent=4))
-{
- "cv2.GaussianBlur#1": {
- "ksize_width": 3,
- "ksize_height": 3,
- "sigma_x": 0,
- "sigma_y": 0
- },
- "skimage.feature.hog#1": {
- "multichannel": true,
- "visualize": false,
- "orientations": 9,
- "pixels_per_cell_x": 8,
- "pixels_per_cell_y": 8,
- "cells_per_block_x": 3,
- "cells_per_block_y": 3,
- "block_norm": null
- },
- "sklearn.ensemble.RandomForestClassifier#1": {
- "n_jobs": -1,
- "n_estimators": 100,
- "criterion": "entropy",
- "max_features": null,
- "max_depth": 10,
- "min_samples_split": 0.1,
- "min_samples_leaf": 0.1,
- "class_weight": null
- }
-}
-```
-
-## Making predictions
-
-Once we have created the pipeline with the desired hyperparameters we can fit it
-and then use it to make predictions on new data.
-
-To do this, we first call the `fit` method passing the training data and the corresponding labels.
-
-In this case in particular, we will be loading the handwritten digit classification dataset
-from USPS using the `mlblocks.datasets.load_usps` method, which returns a dataset object
-ready to be played with.
-
-```python
->>> from mlblocks.datasets import load_usps
->>> dataset = load_usps()
->>> X_train, X_test, y_train, y_test = dataset.get_splits(1)
->>> pipeline.fit(X_train, y_train)
-```
-
-Once we have fitted our model to our data, we can call the `predict` method passing new data
-to obtain predictions from the pipeline.
-
-```python
->>> predictions = pipeline.predict(X_test)
->>> predictions
-array([3, 2, 1, ..., 1, 1, 2])
+dataset.score(y_test, predictions)
```
# What's Next?
If you want to learn more about how to tune the pipeline hyperparameters, save and load
the pipelines using JSON annotations or build complex multi-branched pipelines, please
-check our [documentation](https://HDI-Project.github.io/MLBlocks).
+check our [documentation site](https://HDI-Project.github.io/MLBlocks).
+
+Also do not forget to have a look at the [notebook tutorials](
+https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)!
-## Citing MLBlocks
+# Citing MLBlocks
-If you use MLBlocks, please consider citing our related papers.
+If you use MLBlocks for your research, please consider citing our related papers.
For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at
the MIT Data To AI Lab, please see:
diff --git a/docs/conf.py b/docs/conf.py
index 95653914..5ff266d0 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -27,7 +27,6 @@
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
-
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
@@ -47,7 +46,6 @@
}
ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"]
-
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -116,7 +114,7 @@
# documentation.
html_theme_options = {
'collapse_navigation': False,
- 'display_version': False,
+ 'display_version': True,
}
# Add any paths that contain custom static files (such as style sheets) here,
diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst
deleted file mode 100644
index 4163f3bd..00000000
--- a/docs/getting_started/install.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-.. highlight:: shell
-
-Installation
-============
-
-From PyPi
----------
-
-The simplest and recommended way to install MLBlocks is using `pip`:
-
-.. code-block:: console
-
- pip install mlblocks
-
-If you don't have `pip`_ installed, this `Python installation guide`_ can guide
-you through the process.
-
-.. _pip: https://pip.pypa.io
-.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
-
-From sources
-------------
-
-The sources for MLBlocks can be downloaded from the `Github repo`_.
-
-You can either clone the public repository:
-
-.. code-block:: console
-
- git clone git://github.com/HDI-Project/MLBlocks
-
-Or download the `tarball`_:
-
-.. code-block:: console
-
- curl -OL https://github.com/HDI-Project/MLBlocks/tarball/master
-
-Once you have a copy of the source, you can install it running the next command inside the
-project folder:
-
-.. code-block:: console
-
- $ make install
-
-.. _Github repo: https://github.com/HDI-Project/MLBlocks
-.. _tarball: https://github.com/HDI-Project/MLBlocks/tarball/master
-
-Development
------------
-
-If you are installing **MLBlocks** in order to modify its code, the installation must be done
-from its sources, in the editable mode, and also including some additional dependencies in
-order to be able to run the tests and build the documentation:
-
-.. code-block:: console
-
- make install-develop
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
deleted file mode 100644
index 31be89ee..00000000
--- a/docs/getting_started/quickstart.rst
+++ /dev/null
@@ -1,125 +0,0 @@
-Quickstart
-==========
-
-Below is a short tutorial that will show you how to get started using **MLBlocks**.
-
-In this tutorial we will learn how to:
-
-* Create a pipeline using multiple primitives
-* Obtain the list of tunable hyperparameters from the pipeline
-* Specify hyperparameters for each primitive in the pipeline
-* Fit the pipeline using training data
-* Use the pipeline to make predictions from new data
-
-.. note:: Some additional dependencies are required in order to run this Quickstart.
- Make sure that `you have already installed them`_.
-
-Creating a pipeline
--------------------
-
-With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing
-them to the `MLPipeline class`_:
-
-.. ipython:: python
-
- from mlblocks import MLPipeline
- primitives = [
- 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
- 'mlprimitives.custom.feature_extraction.StringVectorizer',
- 'sklearn.ensemble.RandomForestClassifier',
- ]
- pipeline = MLPipeline(primitives)
-
-Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and
-passing them as the ``init_params`` argument:
-
-.. ipython:: python
-
- init_params = {
- 'sklearn.ensemble.RandomForestClassifier': {
- 'n_estimators': 100
- }
- }
- pipeline = MLPipeline(primitives, init_params=init_params)
-
-Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set
-for each block, by calling the `get_hyperparameters method`_.
-
-The output of this method is a dictionary which has the name of each block as keys and
-a dictionary with the `hyperparameters`_ of the corresponding block as values.
-
-.. ipython:: python
-
- pipeline.get_hyperparameters()
-
-Tunable Hyperparameters
------------------------
-
-One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate
-the type and possible values that each primitive hyperparameter accepts.
-
-The list of possible hyperparameters and their details can easily be obtained from the pipeline
-instance by calling its `get_tunable_hyperparameters method`_.
-
-The output of this method is a dictionary that contains the list of tunable hyperparameters
-for each block in the pipeline, ready to be passed to any hyperparameter tuning library such
-as `BTB`_.
-
-.. ipython:: python
-
- pipeline.get_tunable_hyperparameters()
-
-Setting Hyperparameters
------------------------
-
-Modifying the hyperparameters of an already instantiated pipeline can be done using the
-`set_hyperparameters method`_, which expects a dictionary with the same format as the returned
-by the `get_hyperparameters method`_.
-
-Note that if a subset of the hyperparameters is passed, only these will be modified, and the
-other ones will remain unmodified.
-
-.. ipython:: python
-
- new_hyperparameters = {
- 'sklearn.ensemble.RandomForestClassifier#1': {
- 'max_depth': 15
- }
- }
- pipeline.set_hyperparameters(new_hyperparameters)
- hyperparameters = pipeline.get_hyperparameters()
- hyperparameters['sklearn.ensemble.RandomForestClassifier#1']['max_depth']
-
-Making predictions
-------------------
-
-Once we have created the pipeline with the desired hyperparameters we can fit it
-and then use it to make predictions on new data.
-
-To do this, we first call the ``fit`` method passing the training data and the corresponding
-labels.
-
-.. ipython:: python
-
- from mlblocks.datasets import load_personae
- dataset = load_personae()
- X_train, X_test, y_train, y_test = dataset.get_splits(1)
- pipeline.fit(X_train, y_train)
-
-Once we have fitted our model to our data, we can call the ``predict`` method passing new data
-to obtain predictions from the pipeline.
-
-.. ipython:: python
-
- predictions = pipeline.predict(X_test)
- predictions
- dataset.score(y_test, predictions)
-
-.. _you have already installed them: install.html#additional-dependencies
-.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline
-.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters
-.. _hyperparameters: ../advanced_usage/hyperparameters.html
-.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations
-.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters
-.. _BTB: https://github.com/HDI-Project/BTB
-.. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters
diff --git a/docs/index.rst b/docs/index.rst
index c3655b3c..7a6fa800 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,53 +1,10 @@
-What is MLBlocks?
-=================
-
-.. image:: images/mlblocks-logo.png
- :width: 300 px
- :alt: MLBlocks
- :align: center
-
-MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
-tools developed in Python, whether they are custom developments or belong to third party
-libraries, and build Pipelines out of them that can be fitted and then used to make predictions.
-
-This is achieved by providing a simple and intuitive annotation language that allows the
-user to specify how to integrate with each tool, here called primitives, in order to provide
-a common uniform interface to each one of them.
-
-At a high level:
-
-* Each available primitive has been annotated using a standardized JSON file that specifies its
- native interface, as well as which hyperparameters can be used to tune its behavior.
-* A list of primitives that will be combined into a pipeline is provided by the user, optionally
- passing along the hyperparameters to use for each primitive.
-* An MLBlock instance is build for each primitive, offering a common interface for all of them.
-* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in
- the right order, passing the output from each one as input to the next one.
-* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each
- MLBlock instance following the JSON annotation specification.
-* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each
- MLBlock sequentially to obtain the desired predictions.
-
-History
--------
-
-In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal
-data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis:
-
-* `Machine learning blocks`_.
- Bryan Collazo. Masters thesis, MIT EECS, 2015.
-
-With recent availability of a multitude of libraries and tools, we decided it was time to integrate
-them and expand the library to address other data types: images, text, graph, time series and
-integrate with deep learning libraries.
+.. include:: readme.rst
.. toctree::
- :caption: Getting Started
- :titlesonly:
+ :hidden:
+ :maxdepth: 2
- self
- getting_started/install
- getting_started/quickstart
+ Overview <readme>
.. toctree::
:caption: Advanced Usage
@@ -89,6 +46,3 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
-
-.. _Machine learning blocks: https://github.com/HDI-Project/mlblocks
-.. _tarball: https://github.com/HDI-Project/mlblocks/tarball/master
diff --git a/docs/readme.rst b/docs/readme.rst
new file mode 100644
index 00000000..97d49585
--- /dev/null
+++ b/docs/readme.rst
@@ -0,0 +1 @@
+.. mdinclude:: ../README.md
From 753426e5c2ec994fe8f9ca9ab928dde9380f9bf0 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 17:34:55 -0500
Subject: [PATCH 086/160] Update devel dependencies
---
.../tutorials/3. Setting MLPipeline Hyperparameters.ipynb | 4 ++--
setup.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 0914e806..725226f7 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# 2. Setting MLPipeline Hyperparameters\n",
+ "# Setting MLPipeline Hyperparameters\n",
"\n",
"In this short guide we will see how to modify the hyperparameters\n",
"of an MLPipeline in order to modify its behavior or performance.\n",
@@ -429,7 +429,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/setup.py b/setup.py
index 6045c574..ddb0081e 100644
--- a/setup.py
+++ b/setup.py
@@ -92,7 +92,7 @@
],
description="Pipelines and primitives for machine learning and data science.",
extras_require={
- 'dev': development_requires + tests_require,
+ 'dev': development_requires + tests_require + examples_require,
'test': tests_require,
'examples': examples_require,
},
From cd68389890109d055d05eac3ba9aefbd6e94ad1f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 17:36:59 -0500
Subject: [PATCH 087/160] Rename notebook
---
...eline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
rename examples/tutorials/{4. Saving and Loading a Pipeline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} (99%)
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
similarity index 99%
rename from examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
rename to examples/tutorials/4. Saving and Loading a Pipeline.ipynb
index f8a0a5b3..01a58cd5 100644
--- a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
+++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
@@ -180,7 +180,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
From 6e31824e61420038e9a180c0330ab2f745dbd2a2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:15:23 -0500
Subject: [PATCH 088/160] Test readme using rundoc
---
Makefile | 4 ++++
README.md | 2 +-
setup.py | 8 +++++---
tox.ini | 8 +++++++-
4 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/Makefile b/Makefile
index bfc1a5f6..eb422682 100644
--- a/Makefile
+++ b/Makefile
@@ -114,6 +114,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle
test: ## run tests quickly with the default Python
python -m pytest --cov=mlblocks
+.PHONY: test-readme
+test-readme: ## run the readme snippets
+ rundoc run --single-session python3 -t python3 README.md
+
.PHONY: test-all
test-all: ## run tests on every Python version with tox
tox -r
diff --git a/README.md b/README.md
index f3a6e3d7..3f13fec0 100644
--- a/README.md
+++ b/README.md
@@ -120,7 +120,7 @@ Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The
Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv
Preprint 1905.08942. 2019.
-``` bibtex
+```bibtex
@article{smith2019mlbazaar,
author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan},
title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development},
diff --git a/setup.py b/setup.py
index ddb0081e..b6ba498e 100644
--- a/setup.py
+++ b/setup.py
@@ -28,10 +28,12 @@
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'mlprimitives>=0.2,<0.3',
- 'urllib3>=1.20,<1.25',
- 'setuptools>=41.0.0',
+ # 'urllib3>=1.20,<1.25',
+ # 'setuptools>=41.0.0',
'numpy<1.17',
- 'python-dateutil<2.8.1,>=2.1',
+ # 'python-dateutil<2.8.1,>=2.1',
+ 'rundoc>=0.4.3',
+ 'prompt-toolkit>=2.0,<3.0',
]
diff --git a/tox.ini b/tox.ini
index 666eeab0..1b8a777e 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = py35, py36, lint, docs
+envlist = py35, py36, lint, docs, readme
[travis]
@@ -29,3 +29,9 @@ skipsdist = true
extras = dev
commands =
/usr/bin/env make docs
+
+
+[testenv:readme]
+skipsdist = true
+commands =
+ /usr/bin/env make test-readme
From 507564de001731915692ee698aa33eda49318b75 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:22:58 -0500
Subject: [PATCH 089/160] Fix dependencies
---
setup.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/setup.py b/setup.py
index b6ba498e..a4fcc7a3 100644
--- a/setup.py
+++ b/setup.py
@@ -28,10 +28,8 @@
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
'mlprimitives>=0.2,<0.3',
- # 'urllib3>=1.20,<1.25',
- # 'setuptools>=41.0.0',
+ 'setuptools>=41.0.0',
'numpy<1.17',
- # 'python-dateutil<2.8.1,>=2.1',
'rundoc>=0.4.3',
'prompt-toolkit>=2.0,<3.0',
]
From 3169f7ac4911b272d5d23dd89edaa347298dfc71 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:39:53 -0500
Subject: [PATCH 090/160] Fix readme aspect in the docs
---
README.md | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3f13fec0..793c55f5 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,16 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://codecov.io/gh/HDI-Project/MLBlocks)
[](https://pepy.tech/project/mlblocks)
-# Overview
+---
+
+# MLBlocks
* Free software: MIT license
* Documentation: https://HDI-Project.github.io/MLBlocks
* Homepage: https://github.com/HDI-Project/MLBlocks
+## Overview
+
MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by
seamlessly combining tools from any python library with a simple, common and uniform interface.
From dd4e7cc7a3f95792ac47f93a42f7816eb98ce1f8 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:42:36 -0500
Subject: [PATCH 091/160] Fix README header
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 793c55f5..3d8a02cb 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://codecov.io/gh/HDI-Project/MLBlocks)
[](https://pepy.tech/project/mlblocks)
----
+
# MLBlocks
From 9406c65f1fea6bee3441351dba93b81893a0e3f9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:45:17 -0500
Subject: [PATCH 092/160] Remove misleading point
---
examples/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/README.md b/examples/README.md
index 12131c95..d295414e 100644
--- a/examples/README.md
+++ b/examples/README.md
functionality.
Within this folder you will find:
-* `examples.py`: Simple Python code examples of a class and a function based primitive implementation.
+
* `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
* `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
* `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
@@ -45,7 +45,7 @@ cd MLBlocks
make install-examples
```
-This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives](
https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
4. Enter the `examples` folder and start a Jupyter Notebook:
From e48362685ce9355e4775490e93208e31e2c6278a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:54:24 -0500
Subject: [PATCH 093/160] Fix link
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3d8a02cb..3d3e21cd 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,7 @@ the pipelines using JSON annotations or build complex multi-branched pipelines,
check our [documentation site](https://HDI-Project.github.io/MLBlocks).
Also do not forget to have a look at the [notebook tutorials](
-https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)!
+https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)!
# Citing MLBlocks
From 60b5e425e844ee49dd1d6bd0b63e758cab0bbc6e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 11:17:00 -0500
Subject: [PATCH 094/160] Improve docs quickstart and introduction
---
README.md | 2 +-
docs/index.rst | 73 ++++++++++++++++++++++++++++++++++++++++++++++---
docs/readme.rst | 1 -
3 files changed, 70 insertions(+), 6 deletions(-)
delete mode 100644 docs/readme.rst
diff --git a/README.md b/README.md
index 3d3e21cd..0f54b440 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
# MLBlocks
-* Free software: MIT license
+* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
* Documentation: https://HDI-Project.github.io/MLBlocks
* Homepage: https://github.com/HDI-Project/MLBlocks
diff --git a/docs/index.rst b/docs/index.rst
index 7a6fa800..e891230c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,10 +1,70 @@
-.. include:: readme.rst
+What is MLBlocks?
+=================
+
+.. image:: images/mlblocks-logo.png
+ :width: 300 px
+ :alt: MLBlocks
+ :align: center
+
+* Free software: `MIT license <https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE>`_
+* Documentation: https://HDI-Project.github.io/MLBlocks
+* Homepage: https://github.com/HDI-Project/MLBlocks
+
+MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
+tools developed in Python, whether they are custom developments or belong to third party
+libraries, and build Pipelines out of them that can be fitted and then used to make predictions.
+
+This is achieved by providing a simple and intuitive annotation language that allows the
+user to specify how to integrate with each tool, here called primitives, in order to provide
+a common uniform interface to each one of them.
+
+At a high level:
+
+* Each available primitive has been annotated using a standardized JSON file that specifies its
+ native interface, as well as which hyperparameters can be used to tune its behavior.
+* A list of primitives that will be combined into a pipeline is provided by the user, optionally
+ passing along the hyperparameters to use for each primitive.
+* An MLBlock instance is built for each primitive, offering a common interface for all of them.
+* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in
+ the right order, passing the output from each one as input to the next one.
+* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each
+ MLBlock instance following the JSON annotation specification.
+* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each
+ MLBlock sequentially to obtain the desired predictions.
+
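+A minimal sketch of that flow, assuming that the MLPrimitives library is
+installed (this is the same example used in the Quickstart):
+
+.. code-block:: python
+
+ from mlblocks import MLPipeline
+ from mlprimitives.datasets import load_dataset
+
+ dataset = load_dataset('census')
+ X_train, X_test, y_train, y_test = dataset.get_splits(1)
+
+ pipeline = MLPipeline([
+     'mlprimitives.custom.preprocessing.ClassEncoder',
+     'mlprimitives.custom.feature_extraction.CategoricalEncoder',
+     'sklearn.impute.SimpleImputer',
+     'xgboost.XGBClassifier',
+     'mlprimitives.custom.preprocessing.ClassDecoder'
+ ])
+
+ pipeline.fit(X_train, y_train)          # fits each block in order
+ predictions = pipeline.predict(X_test)  # produces each block in order
+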
+History
+-------
+
+In its first iteration, in 2015, MLBlocks was designed only for multi-table, multi-entity temporal
+data. A good reference for our design rationale at that time is Bryan Collazo’s thesis, written
+under the supervision of Kalyan Veeramachaneni:
+
+* `Machine learning blocks`_.
+ Bryan Collazo. Masters thesis, MIT EECS, 2015.
+
+In 2018, with the recent availability of a multitude of libraries and tools, we decided it was
+time to integrate them and expand the library to address other data types, like images, text,
+graphs or time series, as well as to introduce the usage of deep learning libraries. A second
+iteration of our work was then started by William Xue:
+
+* `A Flexible Framework for Composing End to End Machine Learning Pipelines`_.
+ William Xue. Masters thesis, MIT EECS, 2018.
+
+Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library
+that would become part of a bigger software ecosystem designed to facilitate the development of
+robust end-to-end solutions based on Machine Learning tools. This third iteration of our work
+was presented in 2019 as part of the Machine Learning Bazaar:
+
+* `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_.
+ Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. SIGMOD 2020.
.. toctree::
- :hidden:
- :maxdepth: 2
+ :caption: Getting Started
+ :titlesonly:
- Overview <readme>
+ self
+ getting_started/install
+ getting_started/quickstart
.. toctree::
:caption: Advanced Usage
@@ -46,3 +106,8 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
+
+.. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf
+
+.. _A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf
+.. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942
diff --git a/docs/readme.rst b/docs/readme.rst
deleted file mode 100644
index 97d49585..00000000
--- a/docs/readme.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. mdinclude:: ../README.md
From be97f0597fbbcfceb2db7643550e4f45502b46a2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 11:58:37 -0500
Subject: [PATCH 095/160] Add missing docs
---
docs/getting_started/install.rst | 43 ++++++++++
docs/getting_started/quickstart.rst | 127 ++++++++++++++++++++++++++++
2 files changed, 170 insertions(+)
create mode 100644 docs/getting_started/install.rst
create mode 100644 docs/getting_started/quickstart.rst
diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst
new file mode 100644
index 00000000..d2bda921
--- /dev/null
+++ b/docs/getting_started/install.rst
@@ -0,0 +1,43 @@
+.. highlight:: shell
+
+Installation
+============
+
+From PyPi
+---------
+
+The simplest and recommended way to install MLBlocks is using `pip`:
+
+.. code-block:: console
+
+ pip install mlblocks
+
+If you don't have `pip`_ installed, this `Python installation guide`_ can guide
+you through the process.
+
+.. _pip: https://pip.pypa.io
+.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
+
+Additional dependencies
+-----------------------
+
+In order to be usable, MLBlocks requires a compatible primitives library.
+
+The official library, required in order to follow the MLBlocks tutorials and documentation examples,
+is `MLPrimitives`_, which you can install with this command:
+
+.. code-block:: console
+
+ pip install mlprimitives
+
+.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
+
+Install for development
+-----------------------
+
+If you are installing **MLBlocks** in order to modify its code, the installation must be done
+from source, in editable mode, and must include some additional dependencies needed to run
+the tests and build the documentation. Instructions about this process
+can be found in the `Contributing guide`_.
+
+.. _Contributing guide: ../contributing.html#get-started
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
new file mode 100644
index 00000000..b55223dd
--- /dev/null
+++ b/docs/getting_started/quickstart.rst
@@ -0,0 +1,127 @@
+Quickstart
+==========
+
+Below is a short tutorial that will show you how to get started using **MLBlocks**.
+
+In this tutorial we will learn how to:
+
+* Create a pipeline using multiple primitives
+* Obtain the list of tunable hyperparameters from the pipeline
+* Specify hyperparameters for each primitive in the pipeline
+* Fit the pipeline using training data
+* Use the pipeline to make predictions from new data
+
+.. note:: Some additional dependencies are required in order to run this Quickstart.
+ Make sure that `you have already installed them`_.
+
+Creating a pipeline
+-------------------
+
+With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing
+them to the `MLPipeline class`_:
+
+.. ipython:: python
+
+ from mlblocks import MLPipeline
+ primitives = [
+ 'mlprimitives.custom.preprocessing.ClassEncoder',
+ 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
+ 'sklearn.impute.SimpleImputer',
+ 'xgboost.XGBClassifier',
+ 'mlprimitives.custom.preprocessing.ClassDecoder'
+ ]
+ pipeline = MLPipeline(primitives)
+
+Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and
+passing them as the ``init_params`` argument:
+
+.. ipython:: python
+
+ init_params = {
+ 'sklearn.impute.SimpleImputer': {
+ 'strategy': 'median'
+ }
+ }
+ pipeline = MLPipeline(primitives, init_params=init_params)
+
+Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set
+for each block, by calling the `get_hyperparameters method`_.
+
+The output of this method is a dictionary which has the name of each block as keys and
+a dictionary with the `hyperparameters`_ of the corresponding block as values.
+
+.. ipython:: python
+
+ pipeline.get_hyperparameters()
+
+Tunable Hyperparameters
+-----------------------
+
+One of the main features of `MLBlocks JSON Annotations`_ is the ability to indicate
+the type and the possible values that each primitive hyperparameter accepts.
+
+The list of possible hyperparameters and their details can easily be obtained from the pipeline
+instance by calling its `get_tunable_hyperparameters method`_.
+
+The output of this method is a dictionary that contains the list of tunable hyperparameters
+for each block in the pipeline, ready to be passed to any hyperparameter tuning library such
+as `BTB`_.
+
+.. ipython:: python
+
+ pipeline.get_tunable_hyperparameters()
+
+Setting Hyperparameters
+-----------------------
+
+Modifying the hyperparameters of an already instantiated pipeline can be done using the
+`set_hyperparameters method`_, which expects a dictionary with the same format as the one
+returned by the `get_hyperparameters method`_.
+
+Note that if a subset of the hyperparameters is passed, only those will be modified, while
+the rest will remain unchanged.
+
+.. ipython:: python
+
+ new_hyperparameters = {
+ 'xgboost.XGBClassifier#1': {
+ 'max_depth': 15
+ }
+ }
+ pipeline.set_hyperparameters(new_hyperparameters)
+ hyperparameters = pipeline.get_hyperparameters()
+ hyperparameters['xgboost.XGBClassifier#1']['max_depth']
+
+Making predictions
+------------------
+
+Once we have created the pipeline with the desired hyperparameters we can fit it
+and then use it to make predictions on new data.
+
+To do this, we first call the ``fit`` method passing the training data and the corresponding
+labels.
+
+.. ipython:: python
+
+ from mlprimitives.datasets import load_census
+ dataset = load_census()
+ X_train, X_test, y_train, y_test = dataset.get_splits(1)
+ pipeline.fit(X_train, y_train)
+
+Once we have fitted our model to our data, we can call the ``predict`` method passing new data
+to obtain predictions from the pipeline.
+
+.. ipython:: python
+
+ predictions = pipeline.predict(X_test)
+ predictions
+ dataset.score(y_test, predictions)
+
+.. _you have already installed them: install.html#additional-dependencies
+.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline
+.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters
+.. _hyperparameters: ../advanced_usage/hyperparameters.html
+.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations
+.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters
+.. _BTB: https://github.com/HDI-Project/BTB
+.. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters
From c7194847264d2b85e183073aafd401ea8367c8ba Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 13:40:59 -0500
Subject: [PATCH 096/160] Update quickstart description
---
README.md | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 0f54b440..6fb2f56c 100644
--- a/README.md
+++ b/README.md
@@ -79,8 +79,10 @@ pip install mlprimitives
# Quickstart
-Below is a short example of how to use **MLBlocks** to solve a prediction problem
-using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives).
+Below is a short example of how to use **MLBlocks** to solve the [Adult Census
+Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
+pipeline which combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives),
+[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/).
```python3
from mlblocks import MLPipeline
From 78a47d6cda812406b48c71ba62cb7d5c34d74250 Mon Sep 17 00:00:00 2001
From: JDTheRipperPC
Date: Thu, 20 Feb 2020 12:10:45 +0100
Subject: [PATCH 097/160] Add Development status badge
---
README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README.md b/README.md
index 6fb2f56c..fa4260d5 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
Pipelines and Primitives for Machine Learning and Data Science.
+[](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
[](https://pypi.python.org/pypi/mlblocks)
[](https://travis-ci.org/HDI-Project/MLBlocks)
[](https://codecov.io/gh/HDI-Project/MLBlocks)
@@ -21,6 +22,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
# MLBlocks
* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
+* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
* Documentation: https://HDI-Project.github.io/MLBlocks
* Homepage: https://github.com/HDI-Project/MLBlocks
From 0a9205b08c426e1e3d63fd75a0fc39c855fa176f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Sun, 23 Feb 2020 22:42:46 -0500
Subject: [PATCH 098/160] Add diagram
---
mlblocks/mlpipeline.py | 183 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 177 insertions(+), 6 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dcfc8a0b..051e8338 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -8,6 +8,7 @@
import warnings
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
+from graphviz import Digraph
import numpy as np
@@ -250,8 +251,7 @@ def _get_str_output(self, output):
def get_inputs(self, fit=True):
"""Get a relation of all the input variables required by this pipeline.
- The result is a dictionary that maps each variable name with their
- specified information.
+ The result is a list containing all of the input variables.
Optionally include the fit arguments.
Args:
@@ -259,9 +259,8 @@ def get_inputs(self, fit=True):
Optional argument to include fit arguments or not. Defaults to ``True``.
Returns:
- dictionary:
- A dictionary mapping every input variable's name to a dictionary
- specifying the information corresponding to that input variable.
+ list:
+ List of dictionaries specifying all the input variables.
Each dictionary contains the entry ``name``, as
well as any other metadata that may have been included in the
pipeline inputs specification.
@@ -292,7 +291,19 @@ def get_inputs(self, fit=True):
)
inputs.update(fit_inputs)
- return inputs
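+ # Flatten the inputs dict into a list, putting X and y (when present) first.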
+ inputs_list=[]
+ if 'X' in inputs:
+ inputs_list.append(inputs['X'])
+ del inputs['X']
+
+ if 'y' in inputs:
+ inputs_list.append(inputs['y'])
+ del inputs['y']
+
+ for input_value in inputs.values():
+ inputs_list.append(input_value)
+
+ return inputs_list
def get_outputs(self, outputs='default'):
"""Get the list of output variables that correspond to the specified outputs.
@@ -857,6 +868,166 @@ def to_dict(self):
'outputs': self.outputs,
}
+ def _get_simple_block_name(self, block_name):
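+ # 'package.module.ClassName#1' -> 'ClassName'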
+ full_name = block_name.split("#")[0]
+ simple_name = full_name.split(".")[-1]
+ return simple_name
+
+ def _get_context_name_from_variable(self, variable_name):
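+ # variable_name has the format {primitive-path}#{counter}.{context-name};
+ # split on '#' first, since the primitive path itself contains dots, then
+ # skip the counter to recover the context (variable) name.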
+ block_name = variable_name.split("#")[0]
+ rest = variable_name[len(block_name)+1:]
+ block_index = rest.split(".")[0]
+ context_name = rest[len(block_index)+1:]
+ if len(context_name) == 0:
+ raise ValueError("Invalid variable name")
+ return context_name
+
+
+ def get_diagram(self, fit=True, outputs='default', image_path=None):
+ """
+ Creates a png diagram for the pipeline, showing the pipeline steps, the
+ pipeline inputs and outputs, and the inputs and outputs of each block.
+
+ If strings are given, they can either be one of the named outputs that have
+ been specified in the pipeline definition or a full variable specification
+ following the format ``{block-name}.{variable-name}``.
+
+ Args:
+ fit (bool):
+ Optional argument to include fit arguments or not. Defaults to `True`.
+
+ outputs (str, int, or list[str or int]):
+ Single or list of output specifications.
+
+ image_path (str):
+ Optional argument for the location at which to save the file.
+ Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file.
+
+ Returns:
+ None or `graphviz.Digraph` object:
+ * `graphviz.Digraph` contains the information about the Pipeline Diagram
+ """
+
+ diagram = Digraph(format='png')
+ diagram.attr('graph', splines='ortho')
+ diagram.attr('node', shape='box', penwidth='1')
+
+ # Blocks
+ for block_name in self.blocks.keys():
+ simple_name = self._get_simple_block_name(block_name)
+ diagram.node(block_name, simple_name)
+
+ variables = {}
+
+ # Inputs
+ inputs = self.get_inputs(fit)
+ input_variables = []
+ with diagram.subgraph(name="cluster_inputs") as cluster:
+ cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
+ cluster.attr('node', penwidth='0', fontsize='24')
+ cluster.node('Input', 'Input', fontsize='14')
+ cluster.attr('edge', penwidth='0', arrowhead='none')
+ for input_value in inputs:
+ input_name = input_value['name']
+ variables[input_name] = input_name+'_input'
+ input_variables.append(input_name)
+ cluster.node(variables[input_name], input_name)
+ cluster.edge('Input', variables[input_name])
+
+ with cluster.subgraph() as input_variables_subgraph:
+ input_variables_subgraph.attr(None, rank='same')
+ for index in range(1, len(input_variables)):
+ input_variables_subgraph.edge(variables[input_variables[index-1]],
+ variables[input_variables[index]])
+ input_variables_subgraph.attr(None, rankdir='LR')
+
+ # Outputs
+ outputs = self.get_outputs(outputs)
+ output_variables = []
+ with diagram.subgraph(name="cluster_outputs") as cluster:
+ cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
+ cluster.attr('node', penwidth='0', fontsize='24')
+ cluster.node('Output', 'Output', fontsize='14')
+ cluster.attr('edge', penwidth='0', arrowhead='none')
+ for output in outputs:
+ try:
+ variable_name = self._get_context_name_from_variable(output['variable'])
+ except ValueError:
+ raise NotImplementedError('Can not deal with this type of output specification')
+ cluster.node(variable_name+'_output', variable_name)
+ output_variables.append(variable_name)
+ cluster.edge(output_variables[-1] + '_output', 'Output')
+ with cluster.subgraph() as output_variables_subgraph:
+ output_variables_subgraph.attr(None, rank='same')
+ for index in range(1, len(output_variables)):
+ output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output')
+ output_variables_subgraph.attr(None, rankdir='LR')
+
+ cluster_edges = set()
+
+ # Variables
+ diagram.attr('node', fontsize='14', penwidth='0')
+ diagram.attr('edge', penwidth='1')
+ for block_name, block in self.blocks.items():
+ # Inputs
+ input_names = self.input_names.get(block_name, dict())
+ input_variables = block.produce_args
+ if fit:
+ for input_variable in block.fit_args:
+ if input_variable not in input_variables:
+ input_variables.append(input_variable)
+ for input_variable in input_variables:
+ input_variable_name = input_variable['name']
+ if input_variable_name in input_names:
+ diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue')
+ original_variable_name = input_names[input_variable_name]
+ diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name)
+ cluster_edges.add((block_name+' '+input_variable_name, block_name))
+ else:
+ diagram.edge(variables[input_variable_name], block_name)
+
+ # Outputs
+ output_names = self.output_names.get(block_name, dict())
+ for output_variable in block.produce_output:
+ output_variable_name = output_variable['name']
+ if output_variable_name in output_names:
+ diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red')
+ cluster_edges.add((block_name, block_name+' '+output_variable_name))
+ new_variable_name = output_names[output_variable_name]
+ diagram.node(block_name+' '+new_variable_name, new_variable_name)
+ diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none')
+ variables[new_variable_name] = block_name+' '+new_variable_name
+ else:
+ diagram.node(block_name+' '+output_variable_name, output_variable_name)
+ diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none')
+ variables[output_variable_name] = block_name+' '+output_variable_name
+
+ # Connection to output variables
+ for output_variable in output_variables:
+ variable_block = variables[output_variable]
+ diagram.edge(variable_block, output_variable+'_output')
+
+ # Alignment
+ with diagram.subgraph() as alignment:
+ alignment.attr('graph', penwidth='0')
+ alignment.attr('edge', penwidth='0', arrowhead='none')
+ for index in range(1, len(self.blocks)):
+ alignment.edge(self._get_block_name(index-1), self._get_block_name(index))
+
+ # Optional names
+ alignment.attr('edge', len='1', minlen='1', penwidth='1')
+
+ for first_block, second_block in cluster_edges:
+ with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster:
+ cluster.edge(first_block, second_block)
+
+ if image_path:
+ diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png')
+ else:
+ return diagram
+
+
+
def save(self, path):
"""Save the specification of this MLPipeline in a JSON file.
From 248ccd7b4bdb5f5e9783283ec92b3f0c78d88dff Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Tue, 17 Mar 2020 11:24:48 -0700
Subject: [PATCH 099/160] Setup update
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 1e8ef2ad..93b78ac3 100644
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@
install_requires = [
+ 'graphviz>=0.9,<1',
]
@@ -44,7 +45,6 @@
'm2r>=0.2.0',
'Sphinx>=1.7.1',
'sphinx_rtd_theme>=0.2.4',
- 'graphviz>=0.9',
'ipython>=6.5.0',
'matplotlib>=2.2.3',
'autodocsumm>=0.1.10',
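Moving `graphviz` from the docs extras into `install_requires` is necessary because `mlblocks/mlpipeline.py` now imports it at module level, so the package becomes unimportable without it. A quick illustration of the failure mode this pin prevents (a hypothetical environment check, not part of the patch):

    # In an environment without the graphviz package installed:
    try:
        from mlblocks import MLPipeline  # triggers `from graphviz import Digraph`
    except ImportError as error:
        print('mlblocks now requires graphviz at runtime:', error)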
From 0fd635c249f555217e2cf5d0fa171571a558d718 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Sat, 23 May 2020 16:54:15 -0700
Subject: [PATCH 100/160] Fix test and lint errors
---
mlblocks/mlpipeline.py | 369 ++++++++++++++++++++++++++++++-----------
1 file changed, 268 insertions(+), 101 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 051e8338..128932f6 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -260,7 +260,7 @@ def get_inputs(self, fit=True):
Returns:
-        list:
-            List of dictionaries specifying all the input variables.
+        dict:
+            Dictionary specifying all the input variables, keyed by name.
Each dictionary contains the entry ``name``, as
well as any other metadata that may have been included in the
pipeline inputs specification.
@@ -291,19 +291,7 @@ def get_inputs(self, fit=True):
)
inputs.update(fit_inputs)
- inputs_list=[]
- if 'X' in inputs:
- inputs_list.append(inputs['X'])
- del inputs['X']
-
- if 'y' in inputs:
- inputs_list.append(inputs['y'])
- del inputs['y']
-
- for input_value in inputs.values():
- inputs_list.append(input_value)
-
- return inputs_list
+ return inputs
def get_outputs(self, outputs='default'):
"""Get the list of output variables that correspond to the specified outputs.
@@ -869,67 +857,80 @@ def to_dict(self):
}
def _get_simple_block_name(self, block_name):
+ """
+ Gets the most readable, simplest version of the block name,
+ without the number of the block or excess modifiers.
+
+ Args:
+ block_name (str):
+ Name of the block whose simple name is being extracted.
+ Returns:
+ str:
+ block name stripped of number and other modifiers.
+ """
full_name = block_name.split("#")[0]
simple_name = full_name.split(".")[-1]
return simple_name
def _get_context_name_from_variable(self, variable_name):
+ """
+ Gets the name of the context from the given variable.
+
+ Args:
+ variable_name (str):
+ Name of the variable.
+ Returns:
+ str:
+ Name of the context of the variable.
+ """
block_name = variable_name.split("#")[0]
- rest = variable_name[len(block_name)+1:]
+ rest = variable_name[len(block_name) + 1:]
block_index = rest.split(".")[0]
- context_name = rest[len(block_index)+1:]
+ context_name = rest[len(block_index) + 1:]
if len(context_name) == 0:
raise ValueError("Invalid variable name")
return context_name
-
- def get_diagram(self, fit=True, outputs='default', image_path=None):
+ def _make_diagram_blocks(self, diagram):
"""
- Creates a png diagram for the pipeline, showing Pipeline Steps,
- Pipeline Inputs and Outputs, and block inputs and outputs.
-
- If strings are given, they can either be one of the named outputs that have
- been specified on the pipeline definition or a full variable specification
- following the format ``{block-name}.{variable-name}``.
+ Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram.
Args:
- fit (bool):
- Optional argument to include fit arguments or not. Defaults to `True`.
-
- outputs (str, int, or list[str or int]):
- Single or list of output specifications.
-
- image_path (str):
- Optional argument for the location at which to save the file.
- Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file.
-
- Returns:
- None or `graphviz.Digraph` object:
-            * `graphviz.Digraph` containing the information about the Pipeline Diagram
+ diagram (Digraph):
+ Diagram to be modified.
"""
-
- diagram = Digraph(format='png')
- diagram.attr('graph', splines='ortho')
diagram.attr('node', shape='box', penwidth='1')
-
- # Blocks
for block_name in self.blocks.keys():
simple_name = self._get_simple_block_name(block_name)
diagram.node(block_name, simple_name)
- variables = {}
+ def _make_diagram_inputs(self, diagram, fit):
+ """
+        Modifies the diagram to add the inputs of the pipeline.
- # Inputs
- inputs = self.get_inputs(fit)
+ Args:
+ diagram (Digraph):
+ Diagram to be modified.
+
+ fit (bool):
+ `True` if including fitted arguments, `False` otherwise.
+
+ Returns:
+ dict:
+ Dictionary of variables mapped to their label for their node in the pipeline.
+ """
+ diagram.attr('node', shape='box')
+ variables = {}
input_variables = []
+ inputs = self.get_inputs(fit)
+
with diagram.subgraph(name="cluster_inputs") as cluster:
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='24')
- cluster.node('Input', 'Input', fontsize='14')
cluster.attr('edge', penwidth='0', arrowhead='none')
- for input_value in inputs:
- input_name = input_value['name']
- variables[input_name] = input_name+'_input'
+ cluster.node('Input', 'Input', fontsize='14')
+ for input_name in inputs:
+ variables[input_name] = input_name + '_input'
input_variables.append(input_name)
cluster.node(variables[input_name], input_name)
cluster.edge('Input', variables[input_name])
@@ -937,97 +938,263 @@ def get_diagram(self, fit=True, outputs='default', image_path=None):
with cluster.subgraph() as input_variables_subgraph:
input_variables_subgraph.attr(None, rank='same')
for index in range(1, len(input_variables)):
- input_variables_subgraph.edge(variables[input_variables[index-1]],
- variables[input_variables[index]])
+ input_variables_subgraph.edge(
+ variables[input_variables[index - 1]],
+ variables[input_variables[index]])
input_variables_subgraph.attr(None, rankdir='LR')
- # Outputs
- outputs = self.get_outputs(outputs)
+ return variables
+
+ def _make_diagram_outputs(self, diagram, outputs):
+ """
+ Modifies the diagram to add outputs of the pipeline in order from left to right.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified.
+
+ outputs (str, int, or list[str or int]):
+ Single or list of output specifications.
+
+ Returns:
+ list[str]:
+ List of the human-readable names of the output variables in order
+ """
+ diagram.attr('node', shape='box')
output_variables = []
+ outputs_vars = self.get_outputs(outputs)
+
with diagram.subgraph(name="cluster_outputs") as cluster:
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='24')
- cluster.node('Output', 'Output', fontsize='14')
cluster.attr('edge', penwidth='0', arrowhead='none')
- for output in outputs:
+ cluster.node('Output', 'Output', fontsize='14')
+ for output in outputs_vars:
try:
variable_name = self._get_context_name_from_variable(output['variable'])
except ValueError:
- raise NotImplementedError('Can not deal with this type of output specification')
- cluster.node(variable_name+'_output', variable_name)
+ raise NotImplementedError(
+ 'Can not deal with this type of output specification')
+ cluster.node(variable_name + '_output', variable_name)
output_variables.append(variable_name)
cluster.edge(output_variables[-1] + '_output', 'Output')
with cluster.subgraph() as output_variables_subgraph:
output_variables_subgraph.attr(None, rank='same')
for index in range(1, len(output_variables)):
- output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output')
+ output_variables_subgraph.edge(output_variables[index - 1] + '_output',
+ output_variables[index] + '_output')
output_variables_subgraph.attr(None, rankdir='LR')
- cluster_edges = set()
+ return output_variables
- # Variables
+ def _make_diagram_variables(self, diagram, fit, variables):
+ """
+ Modifies the diagram to add main variables of the pipeline.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified
+
+ fit (bool):
+ `True` if including fitted arguments, `False` otherwise.
+
+ variables (dict):
+ Dictionary of variables mapped to their label for their node in the pipeline.
+
+ Returns:
+ set:
+ Set of tuples of the alternative variable name and its corresponding block
+ in order
+ """
diagram.attr('node', fontsize='14', penwidth='0')
diagram.attr('edge', penwidth='1')
+ cluster_edges = set()
+
for block_name, block in self.blocks.items():
- # Inputs
- input_names = self.input_names.get(block_name, dict())
- input_variables = block.produce_args
- if fit:
- for input_variable in block.fit_args:
- if input_variable not in input_variables:
- input_variables.append(input_variable)
- for input_variable in input_variables:
- input_variable_name = input_variable['name']
- if input_variable_name in input_names:
- diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue')
- original_variable_name = input_names[input_variable_name]
- diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name)
- cluster_edges.add((block_name+' '+input_variable_name, block_name))
- else:
- diagram.edge(variables[input_variable_name], block_name)
-
- # Outputs
- output_names = self.output_names.get(block_name, dict())
- for output_variable in block.produce_output:
- output_variable_name = output_variable['name']
- if output_variable_name in output_names:
- diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red')
- cluster_edges.add((block_name, block_name+' '+output_variable_name))
- new_variable_name = output_names[output_variable_name]
- diagram.node(block_name+' '+new_variable_name, new_variable_name)
- diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none')
- variables[new_variable_name] = block_name+' '+new_variable_name
- else:
- diagram.node(block_name+' '+output_variable_name, output_variable_name)
- diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none')
- variables[output_variable_name] = block_name+' '+output_variable_name
+ self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block,
+ block_name)
+ self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block,
+ block_name)
+ return cluster_edges
+
+ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block,
+ block_name):
+ """
+        Modifies the diagram to add input variables to the corresponding block of the pipeline.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified
+
+ fit (bool):
+ `True` if including fitted arguments, `False` otherwise.
+
+ variables (dict):
+ Dictionary of variables mapped to their label for their node in the pipeline.
+
+ cluster_edges (set):
+ Set of tuples that may contain some alternative variable names and its
+ corresponding block in order
+
+ block (MLBlock):
+ The block to add its input variables to the diagram
+
+ block_name (str):
+ The name of the block to add its input variables to the diagram
+
+ """
+ input_names = self.input_names.get(block_name, dict())
+ input_variables = block.produce_args
+
+ if fit:
+ for input_variable in block.fit_args:
+ if input_variable not in input_variables:
+ input_variables.append(input_variable)
+ for input_variable in input_variables:
+ input_variable_name = input_variable['name']
+ if input_variable_name in input_names:
+ diagram.node(block_name + ' ' + input_variable_name,
+ '(' + input_variable_name + ')', fontcolor='blue')
+ original_variable_name = input_names[input_variable_name]
+ diagram.edge(variables[original_variable_name],
+ block_name + ' ' + input_variable_name)
+ cluster_edges.add((block_name + ' ' + input_variable_name, block_name))
+ else:
+ diagram.edge(variables[input_variable_name], block_name)
+
+ def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block,
+ block_name):
+ """
+        Modifies the diagram to add output variables to the corresponding block of the pipeline.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified
+
+ variables (dict):
+ Dictionary of variables mapped to their label for their node in the pipeline.
+
+ cluster_edges (set):
+ Set of tuples that may contain some alternative variable names and its
+ corresponding block in order
+
+ block (MLBlock):
+ The block to add its output variables to the diagram
+
+ block_name (str):
+ The name of the block to add its output variables to the diagram
+
+ """
+ output_names = self.output_names.get(block_name, dict())
+ for output_variable in block.produce_output:
+ output_variable_name = output_variable['name']
+ if output_variable_name in output_names:
+ diagram.node(block_name + ' ' + output_variable_name,
+ '(' + output_variable_name + ')', fontcolor='red')
+ cluster_edges.add((block_name, block_name + ' ' + output_variable_name))
+ new_variable_name = output_names[output_variable_name]
+ diagram.node(block_name + ' ' + new_variable_name, new_variable_name)
+ diagram.edge(block_name + ' ' + output_variable_name,
+ block_name + ' ' + new_variable_name, arrowhead='none')
+ variables[new_variable_name] = block_name + ' ' + new_variable_name
+ else:
+ diagram.node(block_name + ' ' + output_variable_name, output_variable_name)
+ diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none')
+ variables[output_variable_name] = block_name + ' ' + output_variable_name
+
+ def _make_diagram_output_connections(self, diagram, variables, output_variables):
+ """
+ Modifies the diagram to add connections to the output variables of the pipeline.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified
- # Connection to output variables
+ variables (dict):
+ Dictionary of variables mapped to their label for their node in the pipeline.
+
+ output_variables (list[str]):
+ List of the human-readable names of the output variables in order
+ """
for output_variable in output_variables:
variable_block = variables[output_variable]
- diagram.edge(variable_block, output_variable+'_output')
+ diagram.edge(variable_block, output_variable + '_output')
+
+ def _make_diagram_alignment(self, diagram, cluster_edges):
+ """
+ Modifies the diagram to add alignment edges and connect alternative names to the blocks.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified
- # Alignment
+ cluster_edges (set):
+ Set of tuples that contain alternative variable names and its
+ corresponding block in order
+ """
with diagram.subgraph() as alignment:
alignment.attr('graph', penwidth='0')
alignment.attr('edge', penwidth='0', arrowhead='none')
for index in range(1, len(self.blocks)):
- alignment.edge(self._get_block_name(index-1), self._get_block_name(index))
+ alignment.edge(self._get_block_name(index - 1), self._get_block_name(index))
- # Optional names
alignment.attr('edge', len='1', minlen='1', penwidth='1')
-
for first_block, second_block in cluster_edges:
- with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster:
+ with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster:
cluster.edge(first_block, second_block)
+ def get_diagram(self, fit=True, outputs='default', image_path=None):
+ """
+ Creates a png diagram for the pipeline, showing Pipeline Steps,
+ Pipeline Inputs and Outputs, and block inputs and outputs.
+
+ If strings are given, they can either be one of the named outputs that have
+ been specified on the pipeline definition or a full variable specification
+ following the format ``{block-name}.{variable-name}``.
+
+ Args:
+ fit (bool):
+ Optional argument to include fit arguments or not. Defaults to `True`.
+
+ outputs (str, int, or list[str or int]):
+ Single or list of output specifications.
+
+ image_path (str):
+ Optional argument for the location at which to save the file.
+ Defaults to `None`, which returns a `graphviz.Digraph` object instead of
+ saving the file.
+
+ Returns:
+ None or `graphviz.Digraph` object:
+            * `graphviz.Digraph` containing the information about the Pipeline Diagram
+ """
+
+ diagram = Digraph(format='png')
+ diagram.attr('graph', splines='ortho')
+
+ self._make_diagram_blocks(diagram)
+ variables = self._make_diagram_inputs(diagram, fit)
+ output_variables = self._make_diagram_outputs(diagram, outputs)
+ cluster_edges = self._make_diagram_variables(diagram, fit, variables)
+ self._make_diagram_output_connections(diagram, variables, output_variables)
+ self._make_diagram_alignment(diagram, cluster_edges)
+
if image_path:
diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png')
else:
return diagram
-
-
def save(self, path):
"""Save the specification of this MLPipeline in a JSON file.
From d47e339496b72fb13472fa68e4044f978a9cf0a4 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Tue, 26 May 2020 10:15:32 -0700
Subject: [PATCH 101/160] Fix import order
---
mlblocks/mlpipeline.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 128932f6..a96995be 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -8,9 +8,9 @@
import warnings
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
-from graphviz import Digraph
import numpy as np
+from graphviz import Digraph
from mlblocks.discovery import load_pipeline
from mlblocks.mlblock import MLBlock
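For reference, the corrected excerpt follows the usual isort grouping: standard library first, then third-party packages, then first-party modules, each group separated by a blank line:

    import warnings
    from collections import Counter, OrderedDict, defaultdict
    from copy import deepcopy

    import numpy as np
    from graphviz import Digraph

    from mlblocks.discovery import load_pipeline
    from mlblocks.mlblock import MLBlock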
From 62c310e72b10779544d993bb172be708e8095ccd Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Tue, 26 May 2020 11:36:11 -0700
Subject: [PATCH 102/160] Fix double arrow bug
---
mlblocks/mlpipeline.py | 32 +++++++++++++++-----------------
1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index a96995be..48d68268 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -864,6 +864,7 @@ def _get_simple_block_name(self, block_name):
Args:
block_name (str):
Name of the block whose simple name is being extracted.
+
Returns:
str:
block name stripped of number and other modifiers.
@@ -879,6 +880,7 @@ def _get_context_name_from_variable(self, variable_name):
Args:
variable_name (str):
Name of the variable.
+
Returns:
str:
Name of the context of the variable.
@@ -926,7 +928,7 @@ def _make_diagram_inputs(self, diagram, fit):
with diagram.subgraph(name="cluster_inputs") as cluster:
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
- cluster.attr('node', penwidth='0', fontsize='24')
+ cluster.attr('node', penwidth='0', fontsize='20')
cluster.attr('edge', penwidth='0', arrowhead='none')
cluster.node('Input', 'Input', fontsize='14')
for input_name in inputs:
@@ -966,7 +968,7 @@ def _make_diagram_outputs(self, diagram, outputs):
with diagram.subgraph(name="cluster_outputs") as cluster:
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
- cluster.attr('node', penwidth='0', fontsize='24')
+ cluster.attr('node', penwidth='0', fontsize='20')
cluster.attr('edge', penwidth='0', arrowhead='none')
cluster.node('Output', 'Output', fontsize='14')
for output in outputs_vars:
@@ -1048,23 +1050,23 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e
"""
input_names = self.input_names.get(block_name, dict())
- input_variables = block.produce_args
+ input_variables = set(variable['name'] for variable in block.produce_args)
if fit:
for input_variable in block.fit_args:
- if input_variable not in input_variables:
- input_variables.append(input_variable)
+ if input_variable['name'] not in input_variables:
+ input_variables.add(input_variable['name'])
+
for input_variable in input_variables:
- input_variable_name = input_variable['name']
- if input_variable_name in input_names:
- diagram.node(block_name + ' ' + input_variable_name,
- '(' + input_variable_name + ')', fontcolor='blue')
- original_variable_name = input_names[input_variable_name]
+ if input_variable in input_names:
+ diagram.node(block_name + ' ' + input_variable,
+ '(' + input_variable + ')', fontcolor='blue')
+ original_variable_name = input_names[input_variable]
diagram.edge(variables[original_variable_name],
- block_name + ' ' + input_variable_name)
- cluster_edges.add((block_name + ' ' + input_variable_name, block_name))
+ block_name + ' ' + input_variable)
+ cluster_edges.add((block_name + ' ' + input_variable, block_name))
else:
- diagram.edge(variables[input_variable_name], block_name)
+ diagram.edge(variables[input_variable], block_name)
def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block,
block_name):
@@ -1145,10 +1147,6 @@ def _make_diagram_alignment(self, diagram, cluster_edges):
"""
with diagram.subgraph() as alignment:
alignment.attr('graph', penwidth='0')
- alignment.attr('edge', penwidth='0', arrowhead='none')
- for index in range(1, len(self.blocks)):
- alignment.edge(self._get_block_name(index - 1), self._get_block_name(index))
-
alignment.attr('edge', len='1', minlen='1', penwidth='1')
for first_block, second_block in cluster_edges:
with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster:
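The double arrows came from comparing whole argument dictionaries: a fit argument sharing its name with a produce argument but differing in any other key was appended as a second entry, so the same variable got two edges. Deduplicating by name fixes it; a small self-contained illustration with made-up argument dicts:

    produce_args = [{'name': 'X', 'type': 'DataFrame'}]
    fit_args = [{'name': 'X', 'type': 'array-like'}]

    # Old behavior: full-dict comparison misses the shared name.
    merged = list(produce_args)
    for arg in fit_args:
        if arg not in merged:    # metadata differs, so the dict is "new"
            merged.append(arg)
    print(len(merged))           # 2 -> two edges drawn for 'X'

    # New behavior: a set of names keeps one entry per variable.
    names = {arg['name'] for arg in produce_args}
    names.update(arg['name'] for arg in fit_args)
    print(len(names))            # 1 -> a single edge for 'X'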
From a3394b55aa505771a229676991541ea66cc4c226 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Tue, 26 May 2020 17:06:01 -0700
Subject: [PATCH 103/160] Edit tooltips
---
mlblocks/mlpipeline.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 48d68268..81c3fc19 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -927,10 +927,11 @@ def _make_diagram_inputs(self, diagram, fit):
inputs = self.get_inputs(fit)
with diagram.subgraph(name="cluster_inputs") as cluster:
+ cluster.attr(tooltip='Input variables')
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='20')
cluster.attr('edge', penwidth='0', arrowhead='none')
- cluster.node('Input', 'Input', fontsize='14')
+ cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables')
for input_name in inputs:
variables[input_name] = input_name + '_input'
input_variables.append(input_name)
@@ -967,10 +968,11 @@ def _make_diagram_outputs(self, diagram, outputs):
outputs_vars = self.get_outputs(outputs)
with diagram.subgraph(name="cluster_outputs") as cluster:
+ cluster.attr(tooltip='Output variables')
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='20')
cluster.attr('edge', penwidth='0', arrowhead='none')
- cluster.node('Output', 'Output', fontsize='14')
+ cluster.node('Output', 'Output', fontsize='14', tooltip='Output variables')
for output in outputs_vars:
try:
variable_name = self._get_context_name_from_variable(output['variable'])
@@ -1180,6 +1182,8 @@ def get_diagram(self, fit=True, outputs='default', image_path=None):
diagram = Digraph(format='png')
diagram.attr('graph', splines='ortho')
+ diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges
+ diagram.attr('edge', tooltip=' ')
self._make_diagram_blocks(diagram)
variables = self._make_diagram_inputs(diagram, fit)
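The single-space tooltips only matter for SVG renderings, where graphviz otherwise auto-generates hover text from node and edge names; setting a blank-but-present tooltip suppresses it. The same trick in isolation:

    from graphviz import Digraph

    diagram = Digraph()
    diagram.attr(tooltip=' ')            # blank tooltip on the graph itself
    diagram.attr('edge', tooltip=' ')    # blank tooltip on every edge
    diagram.node('Input', 'Input', tooltip='Input variables')
    print(diagram.source)                # the attributes show up in the DOT source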
From aabab78e56b60216330ec460b8c3a3dcc040aa30 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Wed, 27 May 2020 14:08:46 -0700
Subject: [PATCH 104/160] Fix bug with repetitive variable node names
---
mlblocks/mlpipeline.py | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 81c3fc19..af00e34b 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -1061,12 +1061,13 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e
for input_variable in input_variables:
if input_variable in input_names:
- diagram.node(block_name + ' ' + input_variable,
+ input_variable_label = block_name + ' ' + input_variable + ' (input)'
+ diagram.node(input_variable_label,
'(' + input_variable + ')', fontcolor='blue')
original_variable_name = input_names[input_variable]
diagram.edge(variables[original_variable_name],
- block_name + ' ' + input_variable)
- cluster_edges.add((block_name + ' ' + input_variable, block_name))
+ input_variable_label)
+ cluster_edges.add((input_variable_label, block_name))
else:
diagram.edge(variables[input_variable], block_name)
@@ -1104,18 +1105,20 @@ def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges
for output_variable in block.produce_output:
output_variable_name = output_variable['name']
if output_variable_name in output_names:
- diagram.node(block_name + ' ' + output_variable_name,
+ output_variable_label = block_name + ' ' + output_variable_name + ' (output)'
+ diagram.node(output_variable_label,
'(' + output_variable_name + ')', fontcolor='red')
- cluster_edges.add((block_name, block_name + ' ' + output_variable_name))
+ cluster_edges.add((block_name, output_variable_label))
new_variable_name = output_names[output_variable_name]
diagram.node(block_name + ' ' + new_variable_name, new_variable_name)
- diagram.edge(block_name + ' ' + output_variable_name,
+ diagram.edge(output_variable_label,
block_name + ' ' + new_variable_name, arrowhead='none')
variables[new_variable_name] = block_name + ' ' + new_variable_name
else:
- diagram.node(block_name + ' ' + output_variable_name, output_variable_name)
- diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none')
- variables[output_variable_name] = block_name + ' ' + output_variable_name
+ output_variable_label = block_name + ' ' + output_variable_name
+ diagram.node(output_variable_label, output_variable_name)
+ diagram.edge(block_name, output_variable_label, arrowhead='none')
+ variables[output_variable_name] = output_variable_label
def _make_diagram_output_connections(self, diagram, variables, output_variables):
"""
From ca2c973583942a1dbab72ec96f1013e4069d6995 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Wed, 27 May 2020 22:27:44 -0700
Subject: [PATCH 105/160] Remove unnecessary nodes and edges from diagram
---
mlblocks/mlpipeline.py | 339 ++++++++++++++++++++---------------------
1 file changed, 162 insertions(+), 177 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index af00e34b..6d2738ba 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -893,22 +893,57 @@ def _get_context_name_from_variable(self, variable_name):
raise ValueError("Invalid variable name")
return context_name
- def _make_diagram_blocks(self, diagram):
+ def _get_relevant_output_variables(self, block_name, block, current_output_variables):
"""
- Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram.
+        Gets the output variables of the given block that are in a given set of output variables.
+
+ Args:
+ block_name (str):
+ The name of the block from which the variables are outputted
+ block (MLBlock):
+ The block from which the variables are outputted
+
+            current_output_variables (iterable):
+                Names of the output variables that are still being looked for
+
+ Returns:
+ set:
+ A set of strings containing the output variable name if and only if it is an
+ output variable of the given block and its name is in the list of possible
+ output variables
+ """
+ output_alt_names = self.output_names.get(block_name, dict())
+ relevant_output = set()
+ for block_output in block.produce_output:
+ output_variable_name = block_output['name']
+ if output_variable_name in output_alt_names.keys():
+ output_variable_name = output_alt_names[output_variable_name]
+
+ if output_variable_name in current_output_variables:
+ relevant_output.add(block_output['name'])
+
+ return relevant_output
+
+ def _make_diagram_block(self, diagram, block_name):
+ """
+ Modifies the diagram to add the corresponding block of the pipeline as a visible node in
+ the diagram.
Args:
diagram (Digraph):
Diagram to be modified.
+
+ block_name (str):
+ Name of block to be added to the diagram
"""
- diagram.attr('node', shape='box', penwidth='1')
- for block_name in self.blocks.keys():
- simple_name = self._get_simple_block_name(block_name)
- diagram.node(block_name, simple_name)
+ simple_name = self._get_simple_block_name(block_name)
+ diagram.node(block_name, simple_name, penwidth='1')
- def _make_diagram_inputs(self, diagram, fit):
+ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, variable_blocks):
"""
-        Modifies the diagram to add the inputs of the pipeline.
+        Modifies the diagram to add the input variables of the given block, recording in
+        `variable_blocks` which blocks each variable feeds into. Additionally adds to
+        `cluster_edges` any edge between an alternative input name and this block.
Args:
diagram (Digraph):
@@ -917,37 +952,120 @@ def _make_diagram_inputs(self, diagram, fit):
fit (bool):
`True` if including fitted arguments, `False` otherwise.
- Returns:
- dict:
- Dictionary of variables mapped to their label for their node in the pipeline.
+ block_name (str):
+ Name of block whose input variables are to be added to the diagram
+
+ block (MLBlock):
+ Block whose input variables are to be added to the diagram
+
+ cluster_edges (set):
+ Set of edges between alternative variable names and their corresponding block
+
+ variable_blocks (dict):
+ Dictionary of variable names and the set of blocks into which the variable connects
"""
- diagram.attr('node', shape='box')
- variables = {}
- input_variables = []
- inputs = self.get_inputs(fit)
+ input_alt_names = self.input_names.get(block_name, dict())
+ input_variables = set(variable['name'] for variable in block.produce_args)
+
+ if fit:
+ for input_variable in block.fit_args:
+ if input_variable['name'] not in input_variables:
+ input_variables.add(input_variable['name'])
+ for input_name in input_variables:
+ input_block = block_name
+ if input_name in input_alt_names:
+ input_variable_label = block_name + ' ' + input_name + ' (input)'
+ diagram.node(input_variable_label,
+ '(' + input_name + ')', fontcolor='blue')
+ cluster_edges.add((input_variable_label, block_name))
+ input_name = input_alt_names[input_name]
+ input_block = input_variable_label
+
+ if input_name in variable_blocks.keys():
+ variable_blocks[input_name].add(input_block)
+ else:
+ variable_blocks[input_name] = {input_block}
+
+ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges,
+ variable_blocks):
+ """
+        Modifies the diagram to add the given output variables of the corresponding block,
+        drawing the edges into the blocks that consume them and updating `variable_blocks`.
+        Additionally adds to `cluster_edges` any edge between an alternative output name
+        and this block.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified.
+
+ block_name (str):
+ Name of block whose output variables are to be added to the diagram
+
+ output_names (set):
+ Set of output variable names to be added to the diagram
+
+ cluster_edges (set):
+ Set of edges between alternative variable names and their corresponding block
+
+ variable_blocks (dict):
+ Dictionary of variable names and the set of blocks into which the variable connects
+ """
+ output_alt_names = self.output_names.get(block_name, dict())
+ for output_name in output_names:
+ output_block = block_name
+ if output_name in output_alt_names.keys():
+ alt_variable_label = block_name + ' ' + output_name + ' (output)'
+ diagram.node(alt_variable_label,
+ '(' + output_name + ')', fontcolor='red')
+ cluster_edges.add((block_name, alt_variable_label))
+ output_name = output_alt_names[output_name]
+ output_block = alt_variable_label
+
+ output_variable_label = block_name + ' ' + output_name
+ diagram.node(output_variable_label, output_name)
+ diagram.edge(output_block, output_variable_label, arrowhead='none')
+
+ for block in variable_blocks[output_name]:
+ diagram.edge(output_variable_label, block)
+
+ del variable_blocks[output_name]
+
+ def _make_diagram_inputs(self, diagram, input_variables_blocks):
+ """
+        Modifies the diagram to add the inputs of the pipeline.
+
+ Args:
+ diagram (Digraph):
+ Diagram to be modified.
+
+ input_variables_blocks (dict):
+ Dictionary of input variables of the pipeline and the set of blocks where the
+ corresponding variable is an input
+ """
with diagram.subgraph(name="cluster_inputs") as cluster:
cluster.attr(tooltip='Input variables')
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='20')
cluster.attr('edge', penwidth='0', arrowhead='none')
cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables')
- for input_name in inputs:
- variables[input_name] = input_name + '_input'
- input_variables.append(input_name)
- cluster.node(variables[input_name], input_name)
- cluster.edge('Input', variables[input_name])
+ input_variables = []
+ for input_name, blocks in input_variables_blocks.items():
+ input_name_label = input_name + '_input'
+ cluster.node(input_name_label, input_name)
+ cluster.edge('Input', input_name_label)
+ input_variables.append(input_name_label)
+
+ for block in blocks:
+                diagram.edge(input_name_label, block, penwidth='1')
with cluster.subgraph() as input_variables_subgraph:
input_variables_subgraph.attr(None, rank='same')
for index in range(1, len(input_variables)):
- input_variables_subgraph.edge(
- variables[input_variables[index - 1]],
- variables[input_variables[index]])
+ input_variables_subgraph.edge(input_variables[index - 1],
+ input_variables[index])
input_variables_subgraph.attr(None, rankdir='LR')
- return variables
-
def _make_diagram_outputs(self, diagram, outputs):
"""
Modifies the diagram to add outputs of the pipeline in order from left to right.
@@ -963,7 +1081,6 @@ def _make_diagram_outputs(self, diagram, outputs):
list[str]:
List of the human-readable names of the output variables in order
"""
- diagram.attr('node', shape='box')
output_variables = []
outputs_vars = self.get_outputs(outputs)
@@ -991,153 +1108,6 @@ def _make_diagram_outputs(self, diagram, outputs):
return output_variables
- def _make_diagram_variables(self, diagram, fit, variables):
- """
- Modifies the diagram to add main variables of the pipeline.
-
- Args:
- diagram (Digraph):
- Diagram to be modified
-
- fit (bool):
- `True` if including fitted arguments, `False` otherwise.
-
- variables (dict):
- Dictionary of variables mapped to their label for their node in the pipeline.
-
- Returns:
- set:
- Set of tuples of the alternative variable name and its corresponding block
- in order
- """
- diagram.attr('node', fontsize='14', penwidth='0')
- diagram.attr('edge', penwidth='1')
- cluster_edges = set()
-
- for block_name, block in self.blocks.items():
- self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block,
- block_name)
- self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block,
- block_name)
- return cluster_edges
-
- def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block,
- block_name):
- """
-        Modifies the diagram to add input variables to the corresponding block of the pipeline.
-
- Args:
- diagram (Digraph):
- Diagram to be modified
-
- fit (bool):
- `True` if including fitted arguments, `False` otherwise.
-
- variables (dict):
- Dictionary of variables mapped to their label for their node in the pipeline.
-
- cluster_edges (set):
- Set of tuples that may contain some alternative variable names and its
- corresponding block in order
-
- block (MLBlock):
- The block to add its input variables to the diagram
-
- block_name (str):
- The name of the block to add its input variables to the diagram
-
- """
- input_names = self.input_names.get(block_name, dict())
- input_variables = set(variable['name'] for variable in block.produce_args)
-
- if fit:
- for input_variable in block.fit_args:
- if input_variable['name'] not in input_variables:
- input_variables.add(input_variable['name'])
-
- for input_variable in input_variables:
- if input_variable in input_names:
- input_variable_label = block_name + ' ' + input_variable + ' (input)'
- diagram.node(input_variable_label,
- '(' + input_variable + ')', fontcolor='blue')
- original_variable_name = input_names[input_variable]
- diagram.edge(variables[original_variable_name],
- input_variable_label)
- cluster_edges.add((input_variable_label, block_name))
- else:
- diagram.edge(variables[input_variable], block_name)
-
- def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block,
- block_name):
- """
-        Modifies the diagram to add output variables to the corresponding block of the pipeline.
-
- Args:
- diagram (Digraph):
- Diagram to be modified
-
- variables (dict):
- Dictionary of variables mapped to their label for their node in the pipeline.
-
- cluster_edges (set):
- Set of tuples that may contain some alternative variable names and its
- corresponding block in order
-
- block (MLBlock):
- The block to add its output variables to the diagram
-
- block_name (str):
- The name of the block to add its output variables to the diagram
-
- """
- output_names = self.output_names.get(block_name, dict())
- for output_variable in block.produce_output:
- output_variable_name = output_variable['name']
- if output_variable_name in output_names:
- output_variable_label = block_name + ' ' + output_variable_name + ' (output)'
- diagram.node(output_variable_label,
- '(' + output_variable_name + ')', fontcolor='red')
- cluster_edges.add((block_name, output_variable_label))
- new_variable_name = output_names[output_variable_name]
- diagram.node(block_name + ' ' + new_variable_name, new_variable_name)
- diagram.edge(output_variable_label,
- block_name + ' ' + new_variable_name, arrowhead='none')
- variables[new_variable_name] = block_name + ' ' + new_variable_name
- else:
- output_variable_label = block_name + ' ' + output_variable_name
- diagram.node(output_variable_label, output_variable_name)
- diagram.edge(block_name, output_variable_label, arrowhead='none')
- variables[output_variable_name] = output_variable_label
-
- def _make_diagram_output_connections(self, diagram, variables, output_variables):
- """
- Modifies the diagram to add connections to the output variables of the pipeline.
-
- Args:
- diagram (Digraph):
- Diagram to be modified
-
- variables (dict):
- Dictionary of variables mapped to their label for their node in the pipeline.
-
- output_variables (list[str]):
- List of the human-readable names of the output variables in order
- """
- for output_variable in output_variables:
- variable_block = variables[output_variable]
- diagram.edge(variable_block, output_variable + '_output')
-
def _make_diagram_alignment(self, diagram, cluster_edges):
"""
Modifies the diagram to add alignment edges and connect alternative names to the blocks.
@@ -1152,7 +1122,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges):
"""
with diagram.subgraph() as alignment:
alignment.attr('graph', penwidth='0')
+ alignment.attr('node', penwidth='0')
alignment.attr('edge', len='1', minlen='1', penwidth='1')
+
for first_block, second_block in cluster_edges:
with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster:
cluster.edge(first_block, second_block)
@@ -1187,12 +1159,25 @@ def get_diagram(self, fit=True, outputs='default', image_path=None):
diagram.attr('graph', splines='ortho')
diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges
diagram.attr('edge', tooltip=' ')
+ diagram.attr('node', shape='box', penwidth='0')
- self._make_diagram_blocks(diagram)
- variables = self._make_diagram_inputs(diagram, fit)
output_variables = self._make_diagram_outputs(diagram, outputs)
- cluster_edges = self._make_diagram_variables(diagram, fit, variables)
- self._make_diagram_output_connections(diagram, variables, output_variables)
+
+ cluster_edges = set()
+ variable_blocks = dict((name, {name + '_output'}) for name in output_variables)
+ for block_name, block in reversed(self.blocks.items()):
+ relevant_output_names = self._get_relevant_output_variables(block_name, block,
+ variable_blocks.keys())
+ if len(relevant_output_names) == 0:
+ continue # skip this block
+
+ self._make_diagram_block(diagram, block_name)
+ self._make_block_outputs(diagram, block_name, relevant_output_names, cluster_edges,
+ variable_blocks)
+ self._make_block_inputs(diagram, fit, block_name, block, cluster_edges,
+ variable_blocks)
+
+ self._make_diagram_inputs(diagram, variable_blocks)
self._make_diagram_alignment(diagram, cluster_edges)
if image_path:
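The rewrite replaces the forward pass over all blocks with a backward pass that starts from the requested outputs and keeps only the variables some later consumer needs, which is what removes the unnecessary nodes. A simplified sketch of the traversal, ignoring the alternative-name mappings and the diagram bookkeeping (`pipeline` and `output_variables` as in the code above):

    # `needed` starts as the pipeline output names and is updated walking backwards.
    needed = set(output_variables)
    for block_name, block in reversed(list(pipeline.blocks.items())):
        produced = {output['name'] for output in block.produce_output}
        if not produced & needed:
            continue                 # nothing downstream uses this block: skip it
        needed -= produced           # these variables are now accounted for...
        needed |= {arg['name'] for arg in block.produce_args}   # ...via these inputs
    # whatever remains in `needed` must be fed by the pipeline inputs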
From 8deb6d64324842656f98968f79ae13d0e7c3c8b9 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 4 Jun 2020 16:56:26 -0700
Subject: [PATCH 106/160] Remove intermediate arrowheads
---
mlblocks/mlpipeline.py | 42 ++++++++++++++++++++++++------------------
1 file changed, 24 insertions(+), 18 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 6d2738ba..2465ea5f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -900,6 +900,7 @@ def _get_relevant_output_variables(self, block_name, block, current_output_varia
Args:
block_name (str):
The name of the block from which the variables are outputted
+
block (MLBlock):
The block from which the variables are outputted
@@ -959,10 +960,12 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var
Block whose input variables are to be added to the diagram
cluster_edges (set):
- Set of edges between alternative variable names and their corresponding block
+ Set of tuples representing edges between alternative variable names and their
+ corresponding block and the type of arrowhead
variable_blocks (dict):
- Dictionary of variable names and the set of blocks into which the variable connects
+ Dictionary of variable names and the set of tuples of blocks into which the
+ variable connects and the type of arrowhead to use
"""
input_alt_names = self.input_names.get(block_name, dict())
input_variables = set(variable['name'] for variable in block.produce_args)
@@ -974,18 +977,20 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var
for input_name in input_variables:
input_block = block_name
+ arrowhead = 'normal'
if input_name in input_alt_names:
input_variable_label = block_name + ' ' + input_name + ' (input)'
diagram.node(input_variable_label,
'(' + input_name + ')', fontcolor='blue')
- cluster_edges.add((input_variable_label, block_name))
+ cluster_edges.add((input_variable_label, block_name, 'normal'))
input_name = input_alt_names[input_name]
input_block = input_variable_label
+ arrowhead = 'none'
if input_name in variable_blocks.keys():
- variable_blocks[input_name].add(input_block)
+ variable_blocks[input_name].add((input_block, arrowhead))
else:
- variable_blocks[input_name] = {input_block}
+ variable_blocks[input_name] = {(input_block, arrowhead)}
def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges,
variable_blocks):
@@ -1006,10 +1011,12 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges,
Set of output variable names to be added to the diagram
cluster_edges (set):
- Set of edges between alternative variable names and their corresponding block
+ Set of tuples representing edges between alternative variable names and their
+ corresponding block and the type of arrowhead
variable_blocks (dict):
- Dictionary of variable names and the set of blocks into which the variable connects
+ Dictionary of variable names and the set of tuples of blocks into which the
+ variable connects and the type of arrowhead to use
"""
output_alt_names = self.output_names.get(block_name, dict())
for output_name in output_names:
@@ -1018,7 +1025,7 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges,
alt_variable_label = block_name + ' ' + output_name + ' (output)'
diagram.node(alt_variable_label,
'(' + output_name + ')', fontcolor='red')
- cluster_edges.add((block_name, alt_variable_label))
+ cluster_edges.add((block_name, alt_variable_label, 'none'))
output_name = output_alt_names[output_name]
output_block = alt_variable_label
@@ -1026,8 +1033,8 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges,
diagram.node(output_variable_label, output_name)
diagram.edge(output_block, output_variable_label, arrowhead='none')
- for block in variable_blocks[output_name]:
- diagram.edge(output_variable_label, block)
+ for block, arrow in variable_blocks[output_name]:
+ diagram.edge(output_variable_label, block, arrowhead=arrow)
del variable_blocks[output_name]
@@ -1040,8 +1047,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks):
Diagram to be modified.
input_variables_blocks (dict):
- Dictionary of input variables of the pipeline and the set of blocks where the
- corresponding variable is an input
+ Dictionary of input variables of the pipeline and the set of tuples of blocks into
+ which the variable connects and the type of arrowhead to use
"""
with diagram.subgraph(name="cluster_inputs") as cluster:
cluster.attr(tooltip='Input variables')
@@ -1056,8 +1063,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks):
cluster.edge('Input', input_name_label)
input_variables.append(input_name_label)
- for block in blocks:
-                diagram.edge(input_name_label, block, penwidth='1')
+ for block, arrow in blocks:
+                diagram.edge(input_name_label, block, penwidth='1', arrowhead=arrow)
with cluster.subgraph() as input_variables_subgraph:
input_variables_subgraph.attr(None, rank='same')
@@ -1125,9 +1132,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges):
alignment.attr('node', penwidth='0')
alignment.attr('edge', len='1', minlen='1', penwidth='1')
- for first_block, second_block in cluster_edges:
+ for first_block, second_block, arrow in cluster_edges:
with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster:
- cluster.edge(first_block, second_block)
+ cluster.edge(first_block, second_block, arrowhead=arrow)
def get_diagram(self, fit=True, outputs='default', image_path=None):
"""
@@ -1158,13 +1165,12 @@ def get_diagram(self, fit=True, outputs='default', image_path=None):
diagram = Digraph(format='png')
diagram.attr('graph', splines='ortho')
diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges
- diagram.attr('edge', tooltip=' ')
diagram.attr('node', shape='box', penwidth='0')
output_variables = self._make_diagram_outputs(diagram, outputs)
cluster_edges = set()
- variable_blocks = dict((name, {name + '_output'}) for name in output_variables)
+ variable_blocks = dict((name, {(name + '_output', 'normal')}) for name in output_variables)
for block_name, block in reversed(self.blocks.items()):
relevant_output_names = self._get_relevant_output_variables(block_name, block,
variable_blocks.keys())
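After this change `variable_blocks` maps each variable name to `(target, arrowhead)` tuples rather than bare targets, so an edge into an alternative-name node can end headless while an edge into a block keeps its arrow. A minimal illustration of the bookkeeping:

    variable_blocks = {}
    variable_blocks.setdefault('X', set()).add(('a_primitive#1', 'normal'))
    variable_blocks.setdefault('X', set()).add(('b_primitive#1 X (input)', 'none'))

    for target, arrow in variable_blocks['X']:
        print(target, arrow)   # one edge per tuple, each with its own arrowhead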
From ea8bb9a25e12ee13b29f985874ef15f1f032e690 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 4 Jun 2020 17:55:04 -0700
Subject: [PATCH 107/160] Add diagram tests
---
tests/data/diagrams/diagram_fit.txt | 40 +++++++++
.../data/diagrams/diagram_multiple_blocks.txt | 44 +++++++++
tests/data/diagrams/diagram_simple.txt | 40 +++++++++
tests/test_mlpipeline.py | 90 +++++++++++++++++++
4 files changed, 214 insertions(+)
create mode 100644 tests/data/diagrams/diagram_fit.txt
create mode 100644 tests/data/diagrams/diagram_multiple_blocks.txt
create mode 100644 tests/data/diagrams/diagram_simple.txt
diff --git a/tests/data/diagrams/diagram_fit.txt b/tests/data/diagrams/diagram_fit.txt
new file mode 100644
index 00000000..7939b5e3
--- /dev/null
+++ b/tests/data/diagrams/diagram_fit.txt
@@ -0,0 +1,40 @@
+digraph {
+ graph [splines=ortho]
+ tooltip=" "
+ node [penwidth=0 shape=box]
+ subgraph cluster_outputs {
+ tooltip="Output variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Output [label=Output fontsize=14 tooltip="Output variables"]
+ output_variable_output [label=output_variable]
+ output_variable_output -> Output
+ {
+ rank=same
+ rankdir=LR
+ }
+ }
+ "a_primitive#1" [label=a_primitive penwidth=1]
+ "a_primitive#1 output_variable" [label=output_variable]
+ "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none]
+ "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal]
+	input_variable_input -> "a_primitive#1" [arrowhead=normal penwidth=1]
+ subgraph cluster_inputs {
+ tooltip="Input variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Input [label=Input fontsize=14 tooltip="Input variables"]
+ input_variable_input [label=input_variable]
+ Input -> input_variable_input
+ {
+ rank=same
+ }
+ }
+ {
+ graph [penwidth=0]
+ node [penwidth=0]
+ edge [len=1 minlen=1 penwidth=1]
+ }
+}
diff --git a/tests/data/diagrams/diagram_multiple_blocks.txt b/tests/data/diagrams/diagram_multiple_blocks.txt
new file mode 100644
index 00000000..3f43a108
--- /dev/null
+++ b/tests/data/diagrams/diagram_multiple_blocks.txt
@@ -0,0 +1,44 @@
+digraph {
+ graph [splines=ortho]
+ tooltip=" "
+ node [penwidth=0 shape=box]
+ subgraph cluster_outputs {
+ tooltip="Output variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Output [label=Output fontsize=14 tooltip="Output variables"]
+ output_variable_b_output [label=output_variable_b]
+ output_variable_b_output -> Output
+ {
+ rank=same
+ rankdir=LR
+ }
+ }
+ "b_primitive#1" [label=b_primitive penwidth=1]
+ "b_primitive#1 output_variable_b" [label=output_variable_b]
+ "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none]
+ "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal]
+ "a_primitive#1" [label=a_primitive penwidth=1]
+ "a_primitive#1 output_variable_a" [label=output_variable_a]
+ "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none]
+ "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal]
+	input_variable_input -> "a_primitive#1" [arrowhead=normal penwidth=1]
+ subgraph cluster_inputs {
+ tooltip="Input variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Input [label=Input fontsize=14 tooltip="Input variables"]
+ input_variable_input [label=input_variable]
+ Input -> input_variable_input
+ {
+ rank=same
+ }
+ }
+ {
+ graph [penwidth=0]
+ node [penwidth=0]
+ edge [len=1 minlen=1 penwidth=1]
+ }
+}
diff --git a/tests/data/diagrams/diagram_simple.txt b/tests/data/diagrams/diagram_simple.txt
new file mode 100644
index 00000000..7939b5e3
--- /dev/null
+++ b/tests/data/diagrams/diagram_simple.txt
@@ -0,0 +1,40 @@
+digraph {
+ graph [splines=ortho]
+ tooltip=" "
+ node [penwidth=0 shape=box]
+ subgraph cluster_outputs {
+ tooltip="Output variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Output [label=Output fontsize=14 tooltip="Output variables"]
+ output_variable_output [label=output_variable]
+ output_variable_output -> Output
+ {
+ rank=same
+ rankdir=LR
+ }
+ }
+ "a_primitive#1" [label=a_primitive penwidth=1]
+ "a_primitive#1 output_variable" [label=output_variable]
+ "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none]
+ "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal]
+	input_variable_input -> "a_primitive#1" [arrowhead=normal penwidth=1]
+ subgraph cluster_inputs {
+ tooltip="Input variables"
+ graph [bgcolor=azure3 penwidth=0 rank=source]
+ node [fontsize=20 penwidth=0]
+ edge [arrowhead=none penwidth=0]
+ Input [label=Input fontsize=14 tooltip="Input variables"]
+ input_variable_input [label=input_variable]
+ Input -> input_variable_input
+ {
+ rank=same
+ }
+ }
+ {
+ graph [penwidth=0]
+ node [penwidth=0]
+ edge [len=1 minlen=1 penwidth=1]
+ }
+}
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 340a3838..9d649ad1 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -681,6 +681,96 @@ def test_get_inputs_no_fit(self):
assert inputs == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_diagram_simple(self):
+ f = open('tests/data/diagrams/diagram_simple.txt', 'r')
+ expected = f.read()[:-1]
+ f.close()
+
+ output = [
+ {
+ 'name': 'output_variable',
+ 'type': 'another_whatever',
+ 'variable': 'a_primitive#1.output_variable'
+ }
+ ]
+
+ pipeline = MLPipeline(['a_primitive'], outputs={'default': output})
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input_variable',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].produce_output = output
+
+ assert str(pipeline.get_diagram()) == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_diagram_fit(self):
+ f = open('tests/data/diagrams/diagram_fit.txt', 'r')
+ expected = f.read()[:-1]
+ f.close()
+
+ output = [
+ {
+ 'name': 'output_variable',
+ 'type': 'another_whatever',
+ 'variable': 'a_primitive#1.output_variable'
+ }
+ ]
+
+ pipeline = MLPipeline(['a_primitive'], outputs={'default': output})
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input_variable',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'input_variable',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].produce_output = output
+
+ assert str(pipeline.get_diagram()) == expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_diagram_multiple_blocks(self):
+ f = open('tests/data/diagrams/diagram_multiple_blocks.txt', 'r')
+ expected = f.read()[:-1]
+ f.close()
+
+ first_output = [
+ {
+ 'name': 'output_variable_a',
+ 'type': 'another_whatever',
+ 'variable': 'a_primitive#1.output_variable_a'
+ }
+ ]
+ second_output = [
+ {
+ 'name': 'output_variable_b',
+ 'type': 'another_whatever',
+ 'variable': 'b_primitive#1.output_variable_b'
+ }
+ ]
+
+ pipeline = MLPipeline(['a_primitive', 'b_primitive'], outputs={'default': second_output})
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input_variable',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].produce_output = first_output
+ pipeline.blocks['b_primitive#1'].produce_args = first_output
+ pipeline.blocks['b_primitive#1'].produce_output = second_output
+
+ assert str(pipeline.get_diagram()) == expected
+
def test_fit(self):
pass
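The fixture files hold the expected DOT source, compared against `str(pipeline.get_diagram())` with the trailing newline sliced off on read. When the layout changes intentionally, the fixtures can be regenerated along these lines (a hedged helper assuming `pipeline` is built exactly as in the matching test; not part of the patch):

    diagram = pipeline.get_diagram()
    with open('tests/data/diagrams/diagram_simple.txt', 'w') as fixture:
        fixture.write(str(diagram) + '\n')   # the tests strip this final newline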
From 73865035c6fac86321ea86368d515d4fed068dba Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Tue, 28 Jul 2020 15:52:08 +0300
Subject: [PATCH 108/160] added dictionary to record block execution time
---
mlblocks/mlpipeline.py | 17 +++++++++++++++++
setup.py | 1 +
2 files changed, 18 insertions(+)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dcfc8a0b..6fc789d4 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -8,6 +8,7 @@
import warnings
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy
+from datetime import datetime
import numpy as np
@@ -223,6 +224,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
self.set_hyperparameters(hyperparameters)
self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?')
+ self.time = dict()
def _get_str_output(self, output):
"""Get the outputs that correspond to the str specification."""
@@ -390,6 +392,18 @@ def get_output_variables(self, outputs='default'):
outputs = self.get_outputs(outputs)
return [output['variable'] for output in outputs]
+ def get_time(self):
+ """Get the execution time of each block.
+
+ If called before fitting the pipeline, it will return an empty dictionary.
+
+ Returns:
+ dict:
+ A dictionary containing the block names as keys and
+ the execution time in seconds as values.
+ """
+ return self.time.copy()
+
def _extract_block_name(self, variable_name):
return self._re_block_name.search(variable_name).group(1)
@@ -616,7 +630,10 @@ def _fit_block(self, block, block_name, context):
LOGGER.debug("Fitting block %s", block_name)
try:
fit_args = self._get_block_args(block_name, block.fit_args, context)
+ start = datetime.utcnow()
block.fit(**fit_args)
+ elapsed = datetime.utcnow() - start
+ self.time[block_name] = elapsed.total_seconds()
except Exception:
if self.verbose:
LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
diff --git a/setup.py b/setup.py
index a4fcc7a3..56ab70cd 100644
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@
install_requires = [
+ 'Keras>=2.1.6,<2.4'
]
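Patch 108 stores per-block fit times in a plain dict keyed by block name. Below is a standalone sketch of the same pattern; the helper names are illustrative, not from the patch. `time.perf_counter()` is shown alongside because it is monotonic, whereas `datetime.utcnow()` can jump if the system clock is adjusted mid-run:

```python
import time
from datetime import datetime

times = {}

def timed_fit(block_name, fit, **fit_args):
    # wall-clock variant, as in the patch
    start = datetime.utcnow()
    fit(**fit_args)
    elapsed = datetime.utcnow() - start
    times[block_name] = elapsed.total_seconds()

def timed_fit_monotonic(block_name, fit, **fit_args):
    # monotonic variant, immune to system clock adjustments
    start = time.perf_counter()
    fit(**fit_args)
    times[block_name] = time.perf_counter() - start

timed_fit('a_primitive#1', lambda: None)
print(times)  # {'a_primitive#1': <seconds>}
```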
From d35544ed72850f6ed4224f16e1344039b1bfb2f1 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Tue, 28 Jul 2020 22:39:37 +0300
Subject: [PATCH 109/160] add debug argument for fit/predict
---
mlblocks/mlpipeline.py | 90 ++++++++++++++++++++++++++++++------------
setup.py | 1 -
2 files changed, 65 insertions(+), 26 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 6fc789d4..8e5d0629 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -224,7 +224,6 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
self.set_hyperparameters(hyperparameters)
self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?')
- self.time = dict()
def _get_str_output(self, output):
"""Get the outputs that correspond to the str specification."""
@@ -392,18 +391,6 @@ def get_output_variables(self, outputs='default'):
outputs = self.get_outputs(outputs)
return [output['variable'] for output in outputs]
- def get_time(self):
- """Get the execution time of each block.
-
- If called before fitting the pipeline, it will return an empty dictionary.
-
- Returns:
- dict:
- A dictionary containing the block names as keys and
- the execution time in seconds as values.
- """
- return self.time.copy()
-
def _extract_block_name(self, variable_name):
return self._re_block_name.search(variable_name).group(1)
@@ -625,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value):
index = output_variables.index(variable_name)
outputs[index] = deepcopy(value)
- def _fit_block(self, block, block_name, context):
+ def _fit_block(self, block, block_name, context, debug=False):
"""Get the block args from the context and fit the block."""
LOGGER.debug("Fitting block %s", block_name)
try:
@@ -633,14 +620,21 @@ def _fit_block(self, block, block_name, context):
start = datetime.utcnow()
block.fit(**fit_args)
elapsed = datetime.utcnow() - start
- self.time[block_name] = elapsed.total_seconds()
+
+ if debug:
+ debug_info = {
+ "elapsed": elapsed.total_seconds(),
+ "input": fit_args
+ }
+ return debug_info
+
except Exception:
if self.verbose:
LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
raise
- def _produce_block(self, block, block_name, context, output_variables, outputs):
+ def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False):
"""Get the block args from the context and produce the block.
Afterwards, set the block outputs back into the context and update
@@ -649,7 +643,9 @@ def _produce_block(self, block, block_name, context, output_variables, outputs):
LOGGER.debug("Producing block %s", block_name)
try:
produce_args = self._get_block_args(block_name, block.produce_args, context)
+ start = datetime.utcnow()
block_outputs = block.produce(**produce_args)
+ elapsed = datetime.utcnow() - start
outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output)
context.update(outputs_dict)
@@ -662,13 +658,21 @@ def _produce_block(self, block, block_name, context, output_variables, outputs):
variable_name = '{}.{}'.format(block_name, key)
self._update_outputs(variable_name, output_variables, outputs, value)
+ if debug:
+ debug_info = {
+ "elapsed": elapsed.total_seconds(),
+ "input": produce_args,
+ "output": outputs_dict
+ }
+ return debug_info
+
except Exception:
if self.verbose:
LOGGER.exception("Exception caught producing MLBlock %s", block_name)
raise
- def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
+ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
"""Fit the blocks of this pipeline.
Sequentially call the ``fit`` and the ``produce`` methods of each block,
@@ -698,6 +702,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
+ debug (boolean):
+ Debug mode. If ``True``, a dictionary mapping every block name to
+ its fit and produce time in seconds, inputs and outputs is also
+ returned.
+
**kwargs:
Any additional keyword arguments will be directly added
to the context dictionary and available for the blocks.
@@ -725,6 +733,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
if isinstance(start_, int):
start_ = self._get_block_name(start_)
+ debug_info = None
+ if debug:
+ debug_info = defaultdict(dict)
+
for block_name, block in self.blocks.items():
if start_:
if block_name == start_:
@@ -733,10 +745,15 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
LOGGER.debug("Skipping block %s fit", block_name)
continue
- self._fit_block(block, block_name, context)
+ out = self._fit_block(block, block_name, context, debug)
+ if debug:
+ debug_info["fit"][block_name] = out
if (block_name != self._last_block_name) or (block_name in output_blocks):
- self._produce_block(block, block_name, context, output_variables, outputs)
+ out = self._produce_block(
+ block, block_name, context, output_variables, outputs, debug)
+ if debug:
+ debug_info["produce"][block_name] = out
# We already captured the output from this block
if block_name in output_blocks:
@@ -746,15 +763,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
# outputs we are done.
if output_variables is not None and not output_blocks:
if len(outputs) > 1:
- return tuple(outputs)
+ result = tuple(outputs)
else:
- return outputs[0]
+ result = outputs[0]
+
+ if debug:
+ return result, debug_info
+
+ return result
+
+ if debug:
+ return debug_info
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
- def predict(self, X=None, output_='default', start_=None, **kwargs):
+ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs):
"""Produce predictions using the blocks of this pipeline.
Sequentially call the ``produce`` method of each block, capturing the
@@ -780,6 +805,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs):
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
+ debug (boolean):
+ Debug mode. If ``True``, a dictionary mapping every block name to
+ its produce time in seconds, inputs and outputs is also returned.
+
**kwargs:
Any additional keyword arguments will be directly added
to the context dictionary and available for the blocks.
@@ -798,6 +827,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs):
if isinstance(start_, int):
start_ = self._get_block_name(start_)
+ debug_info = None
+ if debug:
+ debug_info = dict()
+
for block_name, block in self.blocks.items():
if start_:
if block_name == start_:
@@ -806,7 +839,9 @@ def predict(self, X=None, output_='default', start_=None, **kwargs):
LOGGER.debug("Skipping block %s produce", block_name)
continue
- self._produce_block(block, block_name, context, output_variables, outputs)
+ out = self._produce_block(block, block_name, context, output_variables, outputs, debug)
+ if debug:
+ debug_info[block_name] = out
# We already captured the output from this block
if block_name in output_blocks:
@@ -816,9 +851,14 @@ def predict(self, X=None, output_='default', start_=None, **kwargs):
# outputs we are done.
if not output_blocks:
if len(outputs) > 1:
- return tuple(outputs)
+ result = tuple(outputs)
else:
- return outputs[0]
+ result = outputs[0]
+
+ if debug:
+ return result, debug_info
+
+ return result
if start_:
# We skipped all the blocks up to the end
diff --git a/setup.py b/setup.py
index 56ab70cd..a4fcc7a3 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,6 @@
install_requires = [
- 'Keras>=2.1.6,<2.4'
]
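The debug flag changes the return shape of both methods: with an output specification the normal result is paired with the debug dictionary, and a fit() call that produces no output returns the debug dictionary alone. A toy sketch of that convention (`fit_like` is a stand-in for illustration, not the real method):

```python
def fit_like(debug=False, wants_output=False):
    debug_info = {'fit': {}, 'produce': {}} if debug else None
    result = 'outputs' if wants_output else None  # stand-in for block outputs

    if wants_output:
        return (result, debug_info) if debug else result

    return debug_info if debug else None

assert fit_like() is None
assert fit_like(debug=True) == {'fit': {}, 'produce': {}}
assert fit_like(debug=True, wants_output=True)[0] == 'outputs'
```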
From f0cd86f2073e6e1c1a3efe6a0535458374bc597e Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Tue, 28 Jul 2020 23:03:53 +0300
Subject: [PATCH 110/160] update mlprimitives test version
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index a4fcc7a3..85020231 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.2,<0.3',
+ 'mlprimitives>=0.2.4.dev0',
'setuptools>=41.0.0',
'numpy<1.17',
'rundoc>=0.4.3',
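The `.dev0` suffix is significant under PEP 440: development releases sort before the final release they precede, so the new pin admits pre-release builds of mlprimitives 0.2.4 as well as everything after it. A quick check with the `packaging` library (assumed available; it ships alongside setuptools):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# dev releases precede the final release they lead up to
assert Version('0.2.4.dev0') < Version('0.2.4') < Version('0.3')

spec = SpecifierSet('>=0.2.4.dev0')
assert spec.contains('0.2.4.dev0', prereleases=True)
assert spec.contains('0.3.1')
```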
From 2909c03289df305113eae94d41f779263d25f3f6 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Wed, 29 Jul 2020 00:46:12 +0300
Subject: [PATCH 111/160] cap sphinx
---
setup.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/setup.py b/setup.py
index 85020231..4048cbbb 100644
--- a/setup.py
+++ b/setup.py
@@ -47,8 +47,8 @@
'watchdog>=0.8.3',
# docs
- 'm2r>=0.2.0',
- 'Sphinx>=1.7.1',
+ 'm2r>=0.2.0,<0.3',
+ 'Sphinx>=1.7.1,<3',
'sphinx_rtd_theme>=0.2.4',
'graphviz>=0.9',
'ipython>=6.5.0',
From 22a955f47a60e778b50de752f232345e13aac64b Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Wed, 29 Jul 2020 01:18:21 +0300
Subject: [PATCH 112/160] cap isort
---
setup.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/setup.py b/setup.py
index 4048cbbb..57c623f0 100644
--- a/setup.py
+++ b/setup.py
@@ -57,8 +57,8 @@
'docutils<0.15,>=0.10', # botocore incompatibility with 0.15
# style check
- 'flake8>=3.5.0',
- 'isort>=4.3.4',
+ 'flake8>=3.5.0,<3.8',
+ 'isort>=4.3.4<5',
# fix style issues
'autoflake>=1.2', # keep this after flake8 to avoid
From 444f301f641e03150490ade67604e4cc9a23703b Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Wed, 29 Jul 2020 01:53:42 +0300
Subject: [PATCH 113/160] cap isort (properly)
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 57c623f0..c5cf4015 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
# style check
'flake8>=3.5.0,<3.8',
- 'isort>=4.3.4<5',
+ 'isort>=4.3.4,<5',
# fix style issues
'autoflake>=1.2', # keep this after flake8 to avoid
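The bug these two patches chased is easy to reproduce: without the comma, the two clauses fuse into a single specifier whose "version" is `4.3.4<5`, which PEP 440 parsers reject. A short demonstration with `packaging` (an assumption here; pip and setuptools parse requirement specifiers the same way):

```python
from packaging.specifiers import InvalidSpecifier, SpecifierSet

SpecifierSet('>=4.3.4,<5')       # two clauses, parses fine

try:
    SpecifierSet('>=4.3.4<5')    # one clause with version '4.3.4<5'
except InvalidSpecifier as error:
    print('rejected:', error)
```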
From e2b6eb3e0d41717579a2949598af411ddbad1a47 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Thu, 30 Jul 2020 04:15:46 +0300
Subject: [PATCH 114/160] debug dictionary passing + added debug tests
---
mlblocks/mlpipeline.py | 36 +++----
tests/test_mlpipeline.py | 198 +++++++++++++++++++++++++++++++++++++++
2 files changed, 216 insertions(+), 18 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 8e5d0629..8367b327 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -612,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value):
index = output_variables.index(variable_name)
outputs[index] = deepcopy(value)
- def _fit_block(self, block, block_name, context, debug=False):
+ def _fit_block(self, block, block_name, context, debug=None):
"""Get the block args from the context and fit the block."""
LOGGER.debug("Fitting block %s", block_name)
try:
@@ -621,12 +621,11 @@ def _fit_block(self, block, block_name, context, debug=False):
block.fit(**fit_args)
elapsed = datetime.utcnow() - start
- if debug:
- debug_info = {
+ if debug is not None:
+ debug["fit"][block_name] = {
"elapsed": elapsed.total_seconds(),
"input": fit_args
}
- return debug_info
except Exception:
if self.verbose:
@@ -634,7 +633,7 @@ def _fit_block(self, block, block_name, context, debug=False):
raise
- def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False):
+ def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None):
"""Get the block args from the context and produce the block.
Afterwards, set the block outputs back into the context and update
@@ -658,13 +657,17 @@ def _produce_block(self, block, block_name, context, output_variables, outputs,
variable_name = '{}.{}'.format(block_name, key)
self._update_outputs(variable_name, output_variables, outputs, value)
- if debug:
- debug_info = {
+ if debug is not None:
+ record = {
"elapsed": elapsed.total_seconds(),
"input": produce_args,
"output": outputs_dict
}
- return debug_info
+
+ if "fit" in debug.keys():
+ debug["produce"][block_name] = record
+ else:
+ debug[block_name] = record
except Exception:
if self.verbose:
@@ -745,15 +748,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
LOGGER.debug("Skipping block %s fit", block_name)
continue
- out = self._fit_block(block, block_name, context, debug)
- if debug:
- debug_info["fit"][block_name] = out
+ self._fit_block(block, block_name, context, debug_info)
if (block_name != self._last_block_name) or (block_name in output_blocks):
- out = self._produce_block(
- block, block_name, context, output_variables, outputs, debug)
- if debug:
- debug_info["produce"][block_name] = out
+ self._produce_block(
+ block, block_name, context, output_variables, outputs, debug_info)
# We already captured the output from this block
if block_name in output_blocks:
@@ -839,9 +838,7 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
LOGGER.debug("Skipping block %s produce", block_name)
continue
- out = self._produce_block(block, block_name, context, output_variables, outputs, debug)
- if debug:
- debug_info[block_name] = out
+ self._produce_block(block, block_name, context, output_variables, outputs, debug_info)
# We already captured the output from this block
if block_name in output_blocks:
@@ -860,6 +857,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
return result
+ if debug:
+ return debug_info
+
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 340a3838..25a90edb 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -681,6 +681,204 @@ def test_get_inputs_no_fit(self):
assert inputs == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_no_debug(self):
+ mlpipeline = MLPipeline(['a_primitive'])
+ mlpipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ returned = mlpipeline.fit(debug=False)
+
+ assert returned is None
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_debug(self):
+ mlpipeline = MLPipeline(['a_primitive'])
+ mlpipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ expected_return = dict()
+ expected_return["fit"] = {
+ "a_primitive#1": {
+ "elapsed": 0,
+ "input": {
+ "whatever"
+ }
+ }
+ }
+
+ returned = mlpipeline.fit(debug=True)
+
+ assert isinstance(returned, dict)
+ assert set(returned.keys()) == set(expected_return.keys()) # fit / produce
+ assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name
+
+ for block_name, dictionary in expected_return["fit"].items():
+ assert set(returned["fit"][block_name].keys()) == set(dictionary.keys())
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_produce_debug(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_primitive#1.a_variable',
+ 'type': 'a_type',
+ }
+ ]
+ }
+ mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+ mlpipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'a_name',
+ 'type': 'a_type'
+ }
+ ]
+
+ expected_return = dict()
+ expected_return["fit"] = {
+ "a_primitive#1": {
+ "elapsed": 0,
+ "input": {
+ "whatever"
+ }
+ }
+ }
+ expected_return["produce"] = {
+ "a_primitive#1": {
+ "elapsed": 0,
+ "input": {
+ "whatever"
+ },
+ "output": {
+ "whatever"
+ }
+ }
+ }
+
+ returned, debug_returned = mlpipeline.fit(output_='default', debug=True)
+
+ assert len([returned]) == len(outputs["default"])
+ assert isinstance(debug_returned, dict)
+ assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce
+ assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys())
+ assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys())
+
+ for block_name, dictionary in expected_return["fit"].items():
+ assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys())
+
+ for block_name, dictionary in expected_return["produce"].items():
+ assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys())
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_predict_no_debug(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_primitive#1.a_variable',
+ 'type': 'a_type',
+ },
+ {
+ 'name': 'b_name',
+ 'variable': 'a_primitive#1.b_variable',
+ 'type': 'b_type',
+ },
+ ]
+ }
+ mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+ mlpipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'a_name',
+ 'type': 'a_type'
+ },
+ {
+ 'name': 'b_name',
+ 'type': 'b_type'
+ }
+ ]
+
+ returned = mlpipeline.predict(debug=False)
+ assert len(returned) == len(outputs["default"])
+ for returned_output, expected_output in zip(returned, outputs["default"]):
+ assert returned_output == expected_output["variable"]
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_predict_debug(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_primitive#1.a_variable',
+ 'type': 'a_type',
+ }
+ ]
+ }
+ mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+ mlpipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'a_name',
+ 'type': 'a_type'
+ }
+ ]
+
+ expected_return = {
+ "a_primitive#1": {
+ "elapsed": 0,
+ "input": {
+ "whatever"
+ },
+ "output": {
+ "whatever"
+ }
+ }
+ }
+ returned, debug_returned = mlpipeline.predict(debug=True)
+ assert len([returned]) == len(outputs["default"])
+ assert isinstance(debug_returned, dict)
+ assert set(debug_returned.keys()) == set(expected_return.keys())
+
+ for block_name, dictionary in expected_return.items():
+ assert set(debug_returned[block_name].keys()) == set(dictionary.keys())
+
def test_fit(self):
pass
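The refactor in patch 114 replaces the return-value plumbing of patch 109 with a single caller-owned dictionary that the block helpers mutate in place; `_produce_block` tells a fit() call from a predict() call by whether a `"fit"` key is already present. A condensed sketch of that flow (helper names shortened, records stubbed out):

```python
from collections import defaultdict

def fit_block(block_name, debug=None):
    if debug is not None:
        debug['fit'][block_name] = {'elapsed': 0.0, 'input': {}}

def produce_block(block_name, debug=None):
    record = {'elapsed': 0.0, 'input': {}, 'output': {}}
    if debug is not None:
        if 'fit' in debug:                # called from fit()
            debug['produce'][block_name] = record
        else:                             # called from predict()
            debug[block_name] = record

fit_debug = defaultdict(dict)
fit_block('a_primitive#1', fit_debug)
produce_block('a_primitive#1', fit_debug)
assert set(fit_debug) == {'fit', 'produce'}

predict_debug = {}
produce_block('a_primitive#1', predict_debug)
assert set(predict_debug) == {'a_primitive#1'}
```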
From 54c6698a6df91fe646071f5f971224beefa12f1a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 16 Sep 2020 16:58:57 +0200
Subject: [PATCH 115/160] Update dependencies and tutorials
---
.travis.yml | 6 +-
Makefile | 23 +-
.../tutorials/1. Using and MLPipeline.ipynb | 2 +-
.../2. Finding and Loading a Pipeline.ipynb | 35 +-
.... Setting MLPipeline Hyperparameters.ipynb | 2 +-
...ial execution and pipeline debugging.ipynb | 2 +-
.../6. Flexible outputs specification.ipynb | 18 +-
examples/tutorials/7. Tuning a Pipeline.ipynb | 46 +-
...or the best pipeline with BTBSession.ipynb | 533 +++++++-----------
setup.py | 33 +-
tests/test_mlpipeline.py | 3 +-
tox.ini | 29 +-
12 files changed, 327 insertions(+), 405 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 136bd690..7c63a880 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,5 @@
# Config file for automatic testing at travis-ci.org
+dist: bionic
language: python
python:
- 3.6
@@ -6,8 +7,9 @@ python:
# Command to install dependencies
install:
- - pip install -U tox-travis codecov
- - sudo apt-get install graphviz
+ - sudo apt-get update
+ - sudo apt-get install graphviz pandoc
+ - pip install -U tox-travis codecov
# Command to run tests
script: tox
diff --git a/Makefile b/Makefile
index eb422682..6cc80705 100644
--- a/Makefile
+++ b/Makefile
@@ -110,13 +110,30 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle
# TEST TARGETS
-.PHONY: test
-test: ## run tests quickly with the default Python
+.PHONY: test-unit
+test-unit: ## run tests quickly with the default Python
python -m pytest --cov=mlblocks
.PHONY: test-readme
test-readme: ## run the readme snippets
- rundoc run --single-session python3 -t python3 README.md
+ rm -rf tests/readme_test && mkdir tests/readme_test
+ cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md
+ rm -rf tests/readme_test
+
+.PHONY: test-tutorials
+test-tutorials: ## run the tutorial notebooks
+ find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \
+ jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \;
+
+.PHONY: test
+test: test-unit test-readme ## test everything that needs test dependencies
+
+.PHONY: check-dependencies
+check-dependencies: ## test if there are any broken dependencies
+ pip check
+
+.PHONY: test-devel
+test-devel: check-dependencies lint docs ## test everything that needs development dependencies
.PHONY: test-all
test-all: ## run tests on every Python version with tox
diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb
index 733fb42d..dab130ea 100644
--- a/examples/tutorials/1. Using and MLPipeline.ipynb
+++ b/examples/tutorials/1. Using and MLPipeline.ipynb
@@ -625,7 +625,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
index 8df76259..7f14662a 100644
--- a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
+++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
@@ -52,18 +52,20 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['image.classification.hog.random_forest',\n",
- " 'image.classification.hog.xgboost',\n",
- " 'image.classification.resnet50.xgboost']"
+ "['image.classification.hog.rf',\n",
+ " 'image.classification.hog.xgb',\n",
+ " 'image.classification.resnet50.xgb',\n",
+ " 'keras.Sequential.SingleLayerCNNImageClassifier',\n",
+ " 'keras.Sequential.VGGCNNClassifier']"
]
},
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -72,7 +74,7 @@
"from mlblocks.discovery import find_pipelines\n",
"\n",
"filters = {\n",
- " 'metadata.data_modality': 'image',\n",
+ " 'metadata.data_type': 'image',\n",
" 'metadata.task_type': 'classification',\n",
"}\n",
"\n",
@@ -89,13 +91,26 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using TensorFlow backend.\n",
+ "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
+ "Instructions for updating:\n",
+ "If using Keras pass *_constraint arguments to layers.\n",
+ "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"from mlblocks import MLPipeline\n",
"\n",
- "pipeline = MLPipeline('image.classification.resnet50.xgboost')"
+ "pipeline = MLPipeline('image.classification.resnet50.xgb')"
]
}
],
@@ -115,7 +130,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 725226f7..5b7944b5 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -204,7 +204,7 @@
" }\n",
"}\n",
"pipeline = MLPipeline(\n",
- " 'single_table.classification.categorical_encoder.xgboost',\n",
+ " primitives,\n",
" init_params=init_params\n",
")"
]
diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
index 2e21c85b..57b2b43c 100644
--- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
+++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
@@ -704,7 +704,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb
index 3dc3686f..ca1048dd 100644
--- a/examples/tutorials/6. Flexible outputs specification.ipynb
+++ b/examples/tutorials/6. Flexible outputs specification.ipynb
@@ -380,7 +380,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -400,7 +400,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -418,7 +418,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -439,7 +439,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -454,7 +454,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -463,7 +463,7 @@
"(24420, 108)"
]
},
- "execution_count": 28,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -474,7 +474,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -483,7 +483,7 @@
"(24420, 108)"
]
},
- "execution_count": 29,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -509,7 +509,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
index 8dbc4366..4b6eae24 100644
--- a/examples/tutorials/7. Tuning a Pipeline.ipynb
+++ b/examples/tutorials/7. Tuning a Pipeline.ipynb
@@ -58,7 +58,7 @@
"source": [
"from mlblocks import MLPipeline\n",
"\n",
- "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+ "template = MLPipeline('single_table.classification.xgb')"
]
},
{
@@ -204,7 +204,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -213,7 +213,7 @@
"0.8686773872402614"
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -238,7 +238,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -261,7 +261,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -284,7 +284,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -300,7 +300,7 @@
" ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}"
]
},
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -312,7 +312,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -337,7 +337,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -346,13 +346,15 @@
"text": [
"scoring pipeline 1\n",
"scoring pipeline 2\n",
- "New best found: 0.8722706212975673\n",
"scoring pipeline 3\n",
"scoring pipeline 4\n",
+ "New best found: 0.8642241881762839\n",
"scoring pipeline 5\n",
"scoring pipeline 6\n",
"scoring pipeline 7\n",
+ "New best found: 0.8644390957265209\n",
"scoring pipeline 8\n",
+ "New best found: 0.8679095503945804\n",
"scoring pipeline 9\n",
"scoring pipeline 10\n"
]
@@ -386,23 +388,23 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 40,\n",
- " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n",
- " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n",
- " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n",
- " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n",
- " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}"
+ " 'max_labels'): 39,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'most_frequent',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 70,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 6,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.07406443671152008,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.9244108160038952,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}"
]
},
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -422,7 +424,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -431,7 +433,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -455,7 +457,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index a1f0c0f4..1fb4d7ca 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -103,11 +103,7 @@
"source": [
"from mlblocks.discovery import find_pipelines\n",
"\n",
- "filters = {\n",
- " 'metadata.data_modality': 'single_table',\n",
- " 'metadata.task_type': 'classification'\n",
- "}\n",
- "templates = find_pipelines(filters=filters)"
+ "templates = find_pipelines('single_table.classification')"
]
},
{
@@ -118,13 +114,9 @@
{
"data": {
"text/plain": [
- "['single_table.classification.categorical_encoder.logit',\n",
- " 'single_table.classification.categorical_encoder.random_forest',\n",
- " 'single_table.classification.categorical_encoder.xgboost',\n",
- " 'single_table.classification.mlprimitives.logit',\n",
- " 'single_table.classification.mlprimitives.random_forest',\n",
- " 'single_table.classification.mlprimitives.xgboost',\n",
- " 'single_table.classification.mlprimitives_text.xgboost']"
+ "['single_table.classification',\n",
+ " 'single_table.classification.text',\n",
+ " 'single_table.classification.xgb']"
]
},
"execution_count": 4,
@@ -165,7 +157,7 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
"execution_count": 6,
@@ -174,7 +166,7 @@
}
],
"source": [
- "templates_dict['single_table.classification.mlprimitives.xgboost']"
+ "templates_dict['single_table.classification.xgb']"
]
},
{
@@ -250,12 +242,6 @@
"text/plain": [
"{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
" 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n",
- " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
- " 'lowercase'): {'type': 'bool', 'default': True},\n",
- " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
- " 'binary'): {'type': 'bool', 'default': True},\n",
- " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
- " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n",
" ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n",
" 'default': 'mean',\n",
" 'values': ['mean', 'median', 'most_frequent', 'constant']},\n",
@@ -282,7 +268,7 @@
}
],
"source": [
- "tunables['single_table.classification.mlprimitives.xgboost']"
+ "tunables['single_table.classification.xgb']"
]
},
{
@@ -296,7 +282,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -322,13 +308,15 @@
},
{
"cell_type": "code",
- "execution_count": 12,
- "metadata": {},
+ "execution_count": 11,
+ "metadata": {
+ "scrolled": false
+ },
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1",
+ "model_id": "342fe40f08024adcb5b60eea25f49d37",
"version_major": 2,
"version_minor": 0
},
@@ -343,18 +331,98 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n",
- "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n",
- "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n",
- "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n",
- "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n",
- "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n",
- "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n"
+ "2020-09-16 16:32:40,826 - INFO - btb.session - Creating Tunable instance from dict.\n",
+ "2020-09-16 16:32:40,827 - INFO - btb.session - Obtaining default configuration for single_table.classification\n",
+ "2020-09-16 16:32:46,432 - INFO - btb.session - New optimal found: single_table.classification - 0.8639171383183359\n",
+ "2020-09-16 16:32:46,435 - INFO - btb.session - Creating Tunable instance from dict.\n",
+ "2020-09-16 16:32:46,436 - INFO - btb.session - Obtaining default configuration for single_table.classification.text\n",
+ "2020-09-16 16:32:46,583 - ERROR - mlblocks.mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
+ " return self._engine.get_loc(key)\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n",
+ " block_outputs = block.produce(**produce_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n",
+ " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n",
+ " texts = X[self.column]\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
+ " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "2020-09-16 16:32:46,586 - ERROR - btb.session - Proposal 2 - single_table.classification.text crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n",
+ "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n",
+ "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n",
+ "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n",
+ "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
+ " return self._engine.get_loc(key)\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/btb/session.py\", line 336, in run\n",
+ " score = self._scorer(tunable_name, config)\n",
+ " File \"\", line 11, in cross_validate\n",
+ " pipeline.fit(X_train, y_train)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 754, in fit\n",
+ " block, block_name, context, output_variables, outputs, debug_info)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n",
+ " block_outputs = block.produce(**produce_args)\n",
+ " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n",
+ " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n",
+ " texts = X[self.column]\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
+ " indexer = self.columns.get_loc(key)\n",
+ " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
+ " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
+ " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
+ "KeyError: 'text'\n",
+ "2020-09-16 16:32:46,587 - WARNING - btb.session - Too many errors: 1. Removing tunable single_table.classification.text\n",
+ "2020-09-16 16:32:46,589 - INFO - btb.session - Creating Tunable instance from dict.\n",
+ "2020-09-16 16:32:46,589 - INFO - btb.session - Obtaining default configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:32:52,100 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:33:28,900 - INFO - btb.session - New optimal found: single_table.classification - 0.8728234138413778\n",
+ "2020-09-16 16:33:28,904 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n"
]
},
{
@@ -367,20 +435,26 @@
{
"data": {
"text/plain": [
- "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n",
- " 'name': 'single_table.classification.categorical_encoder.xgboost',\n",
+ "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n",
+ " 'name': 'single_table.classification',\n",
" 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 0,\n",
- " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " 'max_labels'): 63,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 7315,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n",
" ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
- " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
- " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
- " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n",
- " 'score': 0.8639171383183359}"
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n",
+ " 'score': 0.8728234138413778}"
]
},
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -411,26 +485,32 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n",
- " 'name': 'single_table.classification.categorical_encoder.xgboost',\n",
+ "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n",
+ " 'name': 'single_table.classification',\n",
" 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 0,\n",
- " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " 'max_labels'): 63,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 7315,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n",
" ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
- " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
- " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
- " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n",
- " 'score': 0.8639171383183359}"
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n",
+ " 'score': 0.8728234138413778}"
]
},
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -455,7 +535,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {
"scrolled": false
},
@@ -463,7 +543,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "a76ce44e1173496e99baaf7ee39a3df7",
+ "model_id": "8dd5d4626f304c279b2b368a671b6cb7",
"version_major": 2,
"version_minor": 0
},
@@ -478,219 +558,27 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n",
- "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n",
- "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n",
- "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
- " return self._engine.get_loc(key)\n",
- " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- "KeyError: 'text'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n",
- " block_outputs = block.produce(**produce_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n",
- " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
- " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n",
- " texts = X[self.column]\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
- " indexer = self.columns.get_loc(key)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
- " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
- " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- "KeyError: 'text'\n",
- "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n",
- "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n",
- "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n",
- "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n",
- "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n",
- "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n",
- "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n",
- " return self._engine.get_loc(key)\n",
- " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- "KeyError: 'text'\n",
- "\n",
- "During handling of the above exception, another exception occurred:\n",
- "\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
- " score = self.scorer(tunable_name, config)\n",
- " File \"\", line 11, in cross_validate\n",
- " pipeline.fit(X_train, y_train)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n",
- " self._produce_block(block, block_name, context, output_variables, outputs)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n",
- " block_outputs = block.produce(**produce_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n",
- " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n",
- " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n",
- " texts = X[self.column]\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n",
- " indexer = self.columns.get_loc(key)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n",
- " return self._engine.get_loc(self._maybe_cast_indexer(key))\n",
- " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n",
- "KeyError: 'text'\n",
- "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n",
- "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
- "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n",
- "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
- "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n",
- "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
- " block.fit(**fit_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
- " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
- " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
- " \"got %s penalty.\" % (solver, penalty))\n",
- "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
- "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n",
- "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n",
- "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n",
- "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n",
- "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n",
- "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n",
- "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n",
- "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n",
- "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n",
- "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n",
- "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
- " score = self.scorer(tunable_name, config)\n",
- " File \"\", line 11, in cross_validate\n",
- " pipeline.fit(X_train, y_train)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n",
- " self._fit_block(block, block_name, context)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
- " block.fit(**fit_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
- " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
- " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
- " \"got %s penalty.\" % (solver, penalty))\n",
- "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.categorical_encoder.logit\n",
- "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n",
- "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
- "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
- "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
- "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
- "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n",
- "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
- " block.fit(**fit_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
- " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
- " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
- " \"got %s penalty.\" % (solver, penalty))\n",
- "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
- "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n",
- "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n",
- "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n",
- "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n",
- "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n",
- "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n",
- "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n",
- "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n",
- "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n",
- "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n",
- "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n",
- "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n",
- "Traceback (most recent call last):\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n",
- " score = self.scorer(tunable_name, config)\n",
- " File \"\", line 11, in cross_validate\n",
- " pipeline.fit(X_train, y_train)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n",
- " self._fit_block(block, block_name, context)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n",
- " block.fit(**fit_args)\n",
- " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n",
- " getattr(self.instance, self.fit_method)(**fit_kwargs)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n",
- " solver = _check_solver(self.solver, self.penalty, self.dual)\n",
- " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n",
- " \"got %s penalty.\" % (solver, penalty))\n",
- "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n",
- "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n",
- "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
- "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
- " warn(\"Some inputs do not have OOB scores. \"\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
- " predictions[k].sum(axis=1)[:, np.newaxis])\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
- " warn(\"Some inputs do not have OOB scores. \"\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
- " predictions[k].sum(axis=1)[:, np.newaxis])\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
- " warn(\"Some inputs do not have OOB scores. \"\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
- " predictions[k].sum(axis=1)[:, np.newaxis])\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
- " warn(\"Some inputs do not have OOB scores. \"\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
- " predictions[k].sum(axis=1)[:, np.newaxis])\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n",
- " warn(\"Some inputs do not have OOB scores. \"\n",
- "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n",
- " predictions[k].sum(axis=1)[:, np.newaxis])\n",
- "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
- "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
- "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n",
- "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n",
- "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n",
- "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n",
- "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n",
- "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n"
+ "2020-09-16 16:34:46,679 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:35:39,310 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:36:53,519 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:37:31,639 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:37:34,254 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:38:33,930 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:38:46,228 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:39:09,193 - INFO - btb.session - New optimal found: single_table.classification - 0.8730998313333643\n",
+ "2020-09-16 16:39:09,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:40:06,793 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:40:44,917 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:41:19,357 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:41:29,076 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:41:46,742 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:42:24,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:42:37,998 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:43:03,272 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:44:01,301 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n",
+ "2020-09-16 16:44:12,500 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:44:32,221 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n",
+ "2020-09-16 16:45:20,148 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n"
]
},
{
@@ -703,26 +591,26 @@
{
"data": {
"text/plain": [
- "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
- " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+ "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n",
+ " 'name': 'single_table.classification',\n",
" 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 22,\n",
+ " 'max_labels'): 97,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
" 'lowercase'): True,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
" 'binary'): True,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
- " 'max_features'): 3863,\n",
- " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+ " 'max_features'): 270,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n",
" ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
- " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
- " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
- " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
- " 'score': 0.8739290413691612}"
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n",
+ " 'score': 0.8730998313333643}"
]
},
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -757,32 +645,32 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
- " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+ "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n",
+ " 'name': 'single_table.classification',\n",
" 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 22,\n",
+ " 'max_labels'): 97,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
" 'lowercase'): True,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
" 'binary'): True,\n",
" ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
- " 'max_features'): 3863,\n",
- " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+ " 'max_features'): 270,\n",
+ " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n",
" ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
- " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
- " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
- " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
- " 'score': 0.8739290413691612}"
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n",
+ " 'score': 0.8730998313333643}"
]
},
- "execution_count": 16,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -794,7 +682,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -818,7 +706,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 16,
"metadata": {
"scrolled": false
},
@@ -826,25 +714,36 @@
{
"data": {
"text/plain": [
- "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n",
- " 'name': 'single_table.classification.categorical_encoder.logit',\n",
+ "[{'id': 'c2cd14c7e9470448a0eeb58a3cce327f',\n",
+ " 'name': 'single_table.classification',\n",
" 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
" 'max_labels'): 0,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 1000,\n",
" ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n",
- " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n",
- " 'score': 0.7975185708718643},\n",
- " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n",
- " 'name': 'single_table.classification.categorical_encoder.random_forest',\n",
- " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
- " 'max_labels'): 0,\n",
+ " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n",
+ " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n",
+ " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n",
+ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n",
+ " 'score': 0.8639171383183359},\n",
+ " {'id': 'adbd189a819483ddc869ceb94513b369',\n",
+ " 'name': 'single_table.classification.text',\n",
+ " 'config': {('mlprimitives.custom.text.TextCleaner#1', 'lower'): True,\n",
+ " ('mlprimitives.custom.text.TextCleaner#1', 'accents'): True,\n",
+ " ('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True,\n",
+ " ('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True,\n",
+ " ('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'lowercase'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'binary'): True,\n",
+ " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+ " 'max_features'): 1000,\n",
" ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
" ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n",
" ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n",
@@ -858,10 +757,10 @@
" ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n",
" ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n",
" ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n",
- " 'score': 0.7591904454179904}]"
+ " 'score': None}]"
]
},
- "execution_count": 20,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -887,7 +786,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.6.9"
}
},
"nbformat": 4,
diff --git a/setup.py b/setup.py
index 945385da..b1aafccb 100644
--- a/setup.py
+++ b/setup.py
@@ -20,15 +20,20 @@
examples_require = [
- 'mlprimitives>=0.2.4.dev0',
- 'jupyter==1.0.0'
+ 'matplotlib>=2.2.2,<3.2.2',
+ 'mlprimitives>=0.2.5,<0.3',
+ 'boto3>=1.14,<1.14.45',
+ 'botocore<1.17.45,>=1.17.44',
+ 'jupyter==1.0.0',
+ 'docutils<0.16,>=0.10',
+ 'baytune>=0.3.0,<0.4',
]
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.2.4.dev0',
+ 'mlprimitives>=0.2,<0.3',
'setuptools>=41.0.0',
'numpy<1.17',
'rundoc>=0.4.3',
@@ -43,34 +48,32 @@
development_requires = [
# general
- 'bumpversion>=0.5.3',
+ 'bumpversion>=0.5.3,<0.6',
'pip>=9.0.1',
- 'watchdog>=0.8.3',
+ 'watchdog>=0.8.3,<0.11',
# docs
'm2r>=0.2.0,<0.3',
'Sphinx>=1.7.1,<3',
- 'sphinx_rtd_theme>=0.2.4',
+ 'sphinx_rtd_theme>=0.2.4,<0.5',
'ipython>=6.5.0',
- 'matplotlib>=2.2.3',
'autodocsumm>=0.1.10',
- 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15
# style check
- 'flake8>=3.5.0,<3.8',
+ 'flake8>=3.7.7,<4',
'isort>=4.3.4,<5',
# fix style issues
- 'autoflake>=1.2', # keep this after flake8 to avoid
- 'autopep8>=1.3.5', # version incompatibilities with flake8
+ 'autoflake>=1.1,<2',
+ 'autopep8>=1.4.3,<2',
# distribute on PyPI
- 'twine>=1.10.0',
+ 'twine>=1.10.0,<4',
'wheel>=0.30.0',
# Advanced testing
- 'tox>=2.9.1',
- 'coverage>=4.5.1',
+ 'coverage>=4.5.1,<6',
+ 'tox>=2.9.1,<4',
# Documentation style
'doc8>=0.8.0',
@@ -93,7 +96,7 @@
description="Pipelines and primitives for machine learning and data science.",
extras_require={
'dev': development_requires + tests_require + examples_require,
- 'test': tests_require,
+ 'test': tests_require + examples_require,
'examples': examples_require,
},
include_package_data=True,
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index ffdd8deb..59e11633 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -879,6 +879,7 @@ def test_predict_debug(self):
for block_name, dictionary in expected_return.items():
assert set(debug_returned[block_name].keys()) == set(dictionary.keys())
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_diagram_simple(self):
f = open('tests/data/diagrams/diagram_simple.txt', 'r')
expected = f.read()[:-1]
@@ -984,4 +985,4 @@ def test_from_dict(self):
pass
def test_load(self):
- pass
\ No newline at end of file
+ pass
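The decorator added to test_get_diagram_simple above applies the same mocking pattern as the neighbouring tests: mlblocks.mlpipeline.MLBlock is patched out so that building the pipeline does not load any real primitive annotations. A minimal sketch of the pattern, where get_mlblock_mock is a hypothetical stand-in for the helper defined in tests/test_mlpipeline.py:

    from unittest.mock import MagicMock, patch

    from mlblocks import MLPipeline

    def get_mlblock_mock(*args, **kwargs):
        # The real helper returns mocks preconfigured with the attributes
        # MLPipeline expects; a bare MagicMock is enough to show the idea.
        return MagicMock()

    @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
    def test_pipeline_without_real_primitives():
        # Every block built inside this test is a mock, so no primitive
        # JSON annotations need to be installed on the system.
        pipeline = MLPipeline(['a_primitive', 'another_primitive'])
        assert list(pipeline.blocks) == ['a_primitive#1', 'another_primitive#1']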
diff --git a/tox.ini b/tox.ini
index 1b8a777e..96d29dbe 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,37 +1,20 @@
[tox]
-envlist = py35, py36, lint, docs, readme
-
+envlist = py3{5,6}, test-devel
[travis]
python =
- 3.6: py36, lint, docs
+ 3.6: py36, test-devel
3.5: py35
-
[testenv]
passenv = CI TRAVIS TRAVIS_*
-setenv =
- PYTHONPATH = {toxinidir}
+skipsdist = false
+skip_install = false
extras = test
commands =
/usr/bin/env make test
-
-[testenv:lint]
-skipsdist = true
-extras = dev
-commands =
- /usr/bin/env make lint
-
-
-[testenv:docs]
-skipsdist = true
+[testenv:test-devel]
extras = dev
commands =
- /usr/bin/env make docs
-
-
-[testenv:readme]
-skipsdist = true
-commands =
- /usr/bin/env make test-readme
+ /usr/bin/env make test-devel
From 6ac5731d69c71533499fbac8ac90932289ebc1c7 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 16 Sep 2020 18:10:55 +0200
Subject: [PATCH 116/160] Add travis wait
---
.travis.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.travis.yml b/.travis.yml
index 7c63a880..97f4bcf8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ install:
- pip install -U tox-travis codecov
# Command to run tests
-script: tox
+script: travis_wait tox
after_success: codecov
From 0fac3ce2cc2f4d4982982c52e63d1e9198a91896 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 16 Sep 2020 20:07:57 +0200
Subject: [PATCH 117/160] travis wait 60 min
---
.travis.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.travis.yml b/.travis.yml
index 97f4bcf8..51ac1dd8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ install:
- pip install -U tox-travis codecov
# Command to run tests
-script: travis_wait tox
+script: travis_wait 60 tox
after_success: codecov
From f6bff86bb061a85789981bfbf0a0366c6cab7f95 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Nov 2020 12:47:35 +0100
Subject: [PATCH 118/160] Remove unused datasets module
---
mlblocks/datasets.py | 447 -----------------------------------------
tests/test_datasets.py | 58 ------
2 files changed, 505 deletions(-)
delete mode 100644 mlblocks/datasets.py
delete mode 100644 tests/test_datasets.py
diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py
deleted file mode 100644
index 0c69afda..00000000
--- a/mlblocks/datasets.py
+++ /dev/null
@@ -1,447 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Datasets module.
-
-This module contains functions that allow loading datasets for easy
-testing of pipelines and primitives over multiple data modalities
-and task types.
-
-The available datasets by data modality and task type are:
-
-+---------------+---------------+-------------------------+
-| Dataset | Data Modality | Task Type |
-+===============+===============+=========================+
-| Amazon | Graph | Community Detection |
-+---------------+---------------+-------------------------+
-| DIC28 | Graph | Graph Matching |
-+---------------+---------------+-------------------------+
-| UMLs | Graph | Link Prediction |
-+---------------+---------------+-------------------------+
-| Nomination | Graph | Vertex Nomination |
-+---------------+---------------+-------------------------+
-| USPS | Image | Classification |
-+---------------+---------------+-------------------------+
-| Hand Geometry | Image | Regression |
-+---------------+---------------+-------------------------+
-| Iris | Single Table | Classification |
-+---------------+---------------+-------------------------+
-| Jester | Single Table | Collaborative Filtering |
-+---------------+---------------+-------------------------+
-| Boston | Single Table | Regression |
-+---------------+---------------+-------------------------+
-| Wiki QA | Multi Table | Classification |
-+---------------+---------------+-------------------------+
-| Personae | Text | Classification |
-+---------------+---------------+-------------------------+
-| News Groups | Text | Classification |
-+---------------+---------------+-------------------------+
-
-"""
-
-import io
-import logging
-import os
-import tarfile
-import urllib
-
-import networkx as nx
-import numpy as np
-import pandas as pd
-from keras.preprocessing.image import img_to_array, load_img
-from sklearn import datasets
-from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-
-LOGGER = logging.getLogger(__name__)
-
-INPUT_SHAPE = [224, 224, 3]
-
-DATA_PATH = os.path.join(
- os.path.dirname(__file__),
- 'data'
-)
-DATA_URL = 'http://dai-mlblocks.s3.amazonaws.com/%7B%7D.tar.gz'
-
-
-class Dataset():
- """Dataset class.
-
- This class represents the abstraction of a dataset and works as
- a container of all the things needed in order to use a dataset
- for testing.
-
- Among other things, it includes the actual dataset data, information
- about its origin, a score function that works for this dataset,
- and a method to split the data in multiple ways for goodness-of-fit
- evaluation.
-
- Attributes:
- name (str): Name of this dataset.
- description (str): Short description about the data that composes this dataset.
- data (array-like): Numpy array or pandas DataFrame containing all the data of
- this dataset, excluding the labels or target values.
- target (array-like): Numpy array or pandas Series containing the expected labels
- or values
- **kwargs: Any additional keyword argument passed on initialization is also
- made available as an instance attribute.
-
- Args:
- description (str): Short description about the data that composes this dataset.
- The first line of the description is expected to be a human friendly
- name for the dataset, and will be set as the `name` attribute.
- data (array-like): Numpy array or pandas DataFrame containing all the data of
- this dataset, excluding the labels or target values.
- target (array-like): Numpy array or pandas Series containing the expected labels
- or values
- score (callable): Function that will be used to compute the score of this dataset.
- shuffle (bool): Whether or not to shuffle the data before splitting.
- stratify (bool): Whether to use a stratified or regular KFold for splitting.
- **kwargs: Any additional keyword argument passed on initialization will be made
- available as instance attributes.
- """
-
- def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
-
- self.name = description.splitlines()[0]
- self.description = description
-
- self.data = data
- self.target = target
-
- self._stratify = stratify
- self._shuffle = shuffle
- self._score = score
-
- self.__dict__.update(kwargs)
-
- def score(self, *args, **kwargs):
- r"""Scoring function for this dataset.
-
- Args:
- \*args, \*\*kwargs: Any given arguments and keyword arguments will be
- directly passed to the given scoring function.
-
- Returns:
- float:
- The computed score.
- """
- return self._score(*args, **kwargs)
-
- def __repr__(self):
- return self.name
-
- def describe(self):
- """Print the description of this Dataset on stdout."""
- print(self.description)
-
- @staticmethod
- def _get_split(data, index):
- if hasattr(data, 'iloc'):
- return data.iloc[index]
- else:
- return data[index]
-
- def get_splits(self, n_splits=1, random_state=0):
- """Return splits of this dataset ready for Cross Validation.
-
- If n_splits is 1, a tuple containing the X for train and test
- and the y for train and test is returned.
- Otherwise, if n_splits is bigger than 1, a list of such tuples
- is returned, one for each split.
-
- Args:
- n_splits (int): Number of times that the data needs to be split.
-
- Returns:
- tuple or list:
- if n_splits is 1, a tuple containing the X for train and test
- and the y for train and test is returned.
- Otherwise, if n_splits is bigger than 1, a list of such tuples
- is returned, one for each split.
- """
- if n_splits == 1:
- stratify = self.target if self._stratify else None
-
- return train_test_split(
- self.data,
- self.target,
- shuffle=self._shuffle,
- stratify=stratify,
- random_state=random_state
- )
-
- else:
- cv_class = StratifiedKFold if self._stratify else KFold
- cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state)
-
- splits = list()
- for train, test in cv.split(self.data, self.target):
- X_train = self._get_split(self.data, train)
- y_train = self._get_split(self.target, train)
- X_test = self._get_split(self.data, test)
- y_test = self._get_split(self.target, test)
- splits.append((X_train, X_test, y_train, y_test))
-
- return splits
-
-
-def _download(dataset_name, dataset_path):
- url = DATA_URL.format(dataset_name)
-
- LOGGER.debug('Downloading dataset %s from %s', dataset_name, url)
- response = urllib.request.urlopen(url)
- bytes_io = io.BytesIO(response.read())
-
- LOGGER.debug('Extracting dataset into %s', DATA_PATH)
- with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf:
- tf.extractall(DATA_PATH)
-
-
-def _load(dataset_name):
- if not os.path.exists(DATA_PATH):
- os.makedirs(DATA_PATH)
-
- dataset_path = os.path.join(DATA_PATH, dataset_name)
- if not os.path.exists(dataset_path):
- _download(dataset_name, dataset_path)
-
- return dataset_path
-
-
-def _load_images(image_dir, filenames):
- LOGGER.debug('Loading %s images from %s', len(filenames), image_dir)
- images = []
- for filename in filenames:
- filename = os.path.join(image_dir, filename)
-
- image = load_img(filename)
- image = image.resize(tuple(INPUT_SHAPE[0:2]))
- image = img_to_array(image)
- image = image / 255.0 # Normalize pixel values to the [0, 1] range.
- images.append(image)
-
- return np.array(images)
-
-
-def _load_csv(dataset_path, name, set_index=False):
- csv_path = os.path.join(dataset_path, name + '.csv')
-
- LOGGER.debug('Loading csv %s', csv_path)
- df = pd.read_csv(csv_path)
-
- if set_index:
- df = df.set_index(df.columns[0], drop=False)
-
- return df
-
-
-def load_usps():
- """USPs Digits Dataset.
-
- The data of this dataset is a collection of 9298 224x224 RGB photos of
- handwritten digits, each one a 3d numpy array with shape (224, 224, 3),
- and the target is a 1d numpy integer array containing the label of the
- digit represented in each image.
- """
- dataset_path = _load('usps')
-
- df = _load_csv(dataset_path, 'data')
- X = _load_images(os.path.join(dataset_path, 'images'), df.image)
- y = df.label.values
-
- return Dataset(load_usps.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_handgeometry():
- """Hand Geometry Dataset.
-
- The data of this dataset is a collection of 112 224x224 RGB photos of
- hands, each one a 3d numpy array with shape (224, 224, 3), and the target
- is a 1d numpy float array containing the width of the wrist in centimeters.
- """
- dataset_path = _load('handgeometry')
-
- df = _load_csv(dataset_path, 'data')
- X = _load_images(os.path.join(dataset_path, 'images'), df.image)
- y = df.target.values
-
- return Dataset(load_handgeometry.__doc__, X, y, r2_score)
-
-
-def load_personae():
- """Personae Dataset.
-
- The data of this dataset is a 2d numpy array containing 145 entries with
- texts written by Dutch Twitter users, along with some additional
- information about each author, and the target is a 1d numpy binary integer
- array indicating whether the author is an extrovert or not.
- """
- dataset_path = _load('personae')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('label').values
-
- return Dataset(load_personae.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_umls():
- """UMLs Dataset.
-
- The data consists of information about a graph of 135 nodes and the
- relations between its nodes, given as a DataFrame with three columns
- (source, target and type) indicating which nodes are related and with
- which type of link. The target is a 1d numpy binary integer array
- indicating whether the indicated link exists or not.
- """
- dataset_path = _load('umls')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('label').values
-
- graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
- return Dataset(load_umls.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_dic28():
- """DIC28 Dataset from Pajek.
-
- This network represents connections among English words in a dictionary.
- It was generated from Knuth's dictionary. Two words are connected by an
- edge if we can reach one from the other by
- - changing a single character (e.g., work - word)
- - adding / removing a single character (e.g., ever - fever).
-
- There exist 52,652 words (vertices in a network) having between 2 and 8
- characters in the dictionary. The obtained network has 89038 edges.
- """
- dataset_path = _load('dic28')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('label').values
-
- graph1 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph1.gml')))
- graph2 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph2.gml')))
-
- graph = graph1.copy()
- graph.add_nodes_from(graph2.nodes(data=True))
- graph.add_edges_from(graph2.edges)
- graph.add_edges_from(X[['graph1', 'graph2']].values)
-
- graphs = {
- 'graph1': graph1,
- 'graph2': graph2,
- }
-
- return Dataset(load_dic28.__doc__, X, y, accuracy_score,
- stratify=True, graph=graph, graphs=graphs)
-
-
-def load_nomination():
- """Sample 1 of graph vertex nomination data from MII Lincoln Lab.
-
- Data consists of one graph whose nodes contain two attributes, attr1 and attr2.
- Associated with each node is a label that has to be learned and predicted.
- """
- dataset_path = _load('nomination')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('label').values
-
- graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
- return Dataset(load_nomination.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_amazon():
- """Amazon product co-purchasing network and ground-truth communities.
-
- The network was collected by crawling the Amazon website. It is based on
- the "Customers Who Bought This Item Also Bought" feature of the Amazon
- website. If a product i is frequently co-purchased with product j, the
- graph contains an undirected edge from i to j. Each product category
- provided by Amazon defines a ground-truth community.
- """
- dataset_path = _load('amazon')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('label').values
-
- graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
- return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score, graph=graph)
-
-
-def load_jester():
- """Ratings from the Jester Online Joke Recommender System.
-
- This dataset consists of over 1.7 million instances of (user_id, item_id, rating)
- triples, which is split 50-50 into train and test data.
-
- source: "University of California Berkeley, CA"
- sourceURI: "http://eigentaste.berkeley.edu/dataset/"
- """
- dataset_path = _load('jester')
-
- X = _load_csv(dataset_path, 'data')
- y = X.pop('rating').values
-
- return Dataset(load_jester.__doc__, X, y, r2_score)
-
-
-def load_wikiqa():
- """Challenge Dataset for Open-Domain Question Answering.
-
- WikiQA dataset is a publicly available set of question and sentence (QS) pairs,
- collected and annotated for research on open-domain question answering.
-
- source: "Microsoft"
- sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#"
- """ # noqa
- dataset_path = _load('wikiqa')
-
- data = _load_csv(dataset_path, 'data', set_index=True)
- questions = _load_csv(dataset_path, 'questions', set_index=True)
- sentences = _load_csv(dataset_path, 'sentences', set_index=True)
- vocabulary = _load_csv(dataset_path, 'vocabulary', set_index=True)
-
- entities = {
- 'data': (data, 'd3mIndex', None),
- 'questions': (questions, 'qIndex', None),
- 'sentences': (sentences, 'sIndex', None),
- 'vocabulary': (vocabulary, 'index', None)
- }
- relationships = [
- ('questions', 'qIndex', 'data', 'qIndex'),
- ('sentences', 'sIndex', 'data', 'sIndex')
- ]
-
- target = data.pop('isAnswer').values
-
- return Dataset(load_wikiqa.__doc__, data, target, accuracy_score, stratify=True,
- entities=entities, relationships=relationships)
-
-
-def load_newsgroups():
- """20 News Groups Dataset.
-
- The data of this dataset is a 1d numpy array containing the texts of
- 11314 newsgroup posts, and the target is a 1d numpy integer array
- indicating which of the 20 topics each post is about.
- """
- dataset = datasets.fetch_20newsgroups()
- return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
- accuracy_score, stratify=True)
-
-
-def load_iris():
- """Iris Dataset."""
- dataset = datasets.load_iris()
- return Dataset(load_iris.__doc__, dataset.data, dataset.target,
- accuracy_score, stratify=True)
-
-
-def load_boston():
- """Boston House Prices Dataset."""
- dataset = datasets.load_boston()
- return Dataset(load_boston.__doc__, dataset.data, dataset.target, r2_score)
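Since the module above is removed without a replacement, any code importing it now breaks. For reference, a short sketch of how the deleted API was typically used, assembled from the names in the deleted code itself:

    from mlblocks.datasets import load_iris  # ImportError after this patch

    dataset = load_iris()
    dataset.describe()  # prints the docstring-based description

    # A single split: train_test_split under the hood.
    X_train, X_test, y_train, y_test = dataset.get_splits(1)

    # Several splits: a (Stratified)KFold-backed list of such tuples.
    for X_train, X_test, y_train, y_test in dataset.get_splits(n_splits=5):
        pass

    # score delegates to the metric the dataset was created with
    # (accuracy_score for iris), so scoring the truth against itself gives 1.0.
    print(dataset.score(y_test, y_test))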
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
deleted file mode 100644
index 174a85d6..00000000
--- a/tests/test_datasets.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from unittest import TestCase
-from unittest.mock import Mock
-
-from mlblocks import datasets
-
-
-class TestDataset(TestCase):
-
- def setUp(self):
- self.description = """Dataset Name.
-
- Some extended description.
- """
- self.score = Mock()
- self.score.return_value = 1.0
-
- self.dataset = datasets.Dataset(
- self.description, 'data', 'target', self.score,
- shuffle=False, stratify=True, some='kwargs')
-
- def test___init__(self):
-
- assert self.dataset.name == 'Dataset Name.'
- assert self.dataset.description == self.description
- assert self.dataset.data == 'data'
- assert self.dataset.target == 'target'
- assert self.dataset._shuffle is False
- assert self.dataset._stratify is True
- assert self.dataset._score == self.score
- assert self.dataset.some == 'kwargs'
-
- def test_score(self):
- returned = self.dataset.score('a', b='c')
-
- assert returned == 1.0
- self.score.assert_called_once_with('a', b='c')
-
- def test___repr__(self):
- repr_ = str(self.dataset)
-
- assert repr_ == "Dataset Name."
-
-
-def test_dataset_describe(capsys):
- """Tested here because fixtures are not supported in TestCases."""
-
- description = """Dataset Name.
-
- Some extended description.
- """
-
- dataset = datasets.Dataset(description, 'data', 'target', 'score')
- dataset.describe()
-
- captured = capsys.readouterr()
- assert captured.out == description + '\n'
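The comment in the deleted test above points at a real pytest constraint: function-scoped fixtures such as capsys cannot be requested from unittest.TestCase methods, which is why the describe() test lived at module level. A minimal standalone illustration of the pattern:

    def test_describe_prints(capsys):
        # pytest injects capsys into plain test functions only,
        # never into unittest.TestCase methods.
        print('Dataset Name.')
        captured = capsys.readouterr()
        assert captured.out == 'Dataset Name.\n'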
From 7afbe40e6006ab5c228df7e8f9ae3e3cc3ab1ce5 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Nov 2020 12:48:42 +0100
Subject: [PATCH 119/160] Update python support and dependency ranges
---
.travis.yml | 5 +++--
README.md | 2 +-
setup.py | 10 ++++++----
tox.ini | 7 ++++---
4 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 51ac1dd8..d2a982f2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,8 +2,9 @@
dist: bionic
language: python
python:
+ - 3.8
+ - 3.7
- 3.6
- - 3.5
# Command to install dependencies
install:
@@ -26,4 +27,4 @@ deploy:
target-branch: gh-pages
on:
branch: master
- python: 3.6
+ python: 3.8
diff --git a/README.md b/README.md
index fa4260d5..127089ac 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ Features include:
## Requirements
-**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/)
+**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/)
Also, although it is not strictly required, the usage of a
[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
diff --git a/setup.py b/setup.py
index b1aafccb..b07eccb6 100644
--- a/setup.py
+++ b/setup.py
@@ -16,12 +16,13 @@
install_requires = [
'graphviz>=0.9,<1',
+ 'numpy>=1.17.1,<1.19',
]
examples_require = [
'matplotlib>=2.2.2,<3.2.2',
- 'mlprimitives>=0.2.5,<0.3',
+ 'mlprimitives>=0.2.6.dev0,<0.3',
'boto3>=1.14,<1.14.45',
'botocore<1.17.45,>=1.17.44',
'jupyter==1.0.0',
@@ -33,9 +34,8 @@
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.2,<0.3',
+ 'mlprimitives>=0.2.6.dev0,<0.3',
'setuptools>=41.0.0',
- 'numpy<1.17',
'rundoc>=0.4.3',
'prompt-toolkit>=2.0,<3.0',
]
@@ -90,8 +90,9 @@
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
],
description="Pipelines and primitives for machine learning and data science.",
extras_require={
@@ -107,6 +108,7 @@
long_description_content_type='text/markdown',
name='mlblocks',
packages=find_packages(include=['mlblocks', 'mlblocks.*']),
+ python_requires='>=3.6,<3.9',
setup_requires=setup_requires,
test_suite='tests',
tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index 96d29dbe..1bc3f81a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,10 +1,11 @@
[tox]
-envlist = py3{5,6}, test-devel
+envlist = py3{6,7,8}, test-devel
[travis]
python =
- 3.6: py36, test-devel
- 3.5: py35
+ 3.8: py38, test-devel
+ 3.7: py37
+ 3.6: py36
[testenv]
passenv = CI TRAVIS TRAVIS_*
From cf419bd64b2f90aafd0e56df25325103c646e45e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 13 Dec 2020 18:16:46 +0100
Subject: [PATCH 120/160] Update baytune dependency
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index b07eccb6..6ae0c75e 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
'botocore<1.17.45,>=1.17.44',
'jupyter==1.0.0',
'docutils<0.16,>=0.10',
- 'baytune>=0.3.0,<0.4',
+ 'baytune>=0.3.13.dev0,<0.4',
]
From 68774a040ee489ee4abbb1e73acfe30ab556bfb2 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 13 Dec 2020 18:31:08 +0100
Subject: [PATCH 121/160] Change links to mlbazaar
---
CONTRIBUTING.rst | 8 +++---
HISTORY.md | 26 +++++++++----------
README.md | 22 ++++++++--------
docs/advanced_usage/adding_primitives.rst | 6 ++---
docs/advanced_usage/hyperparameters.rst | 4 +--
docs/advanced_usage/primitives.rst | 6 ++---
docs/conf.py | 2 +-
docs/getting_started/install.rst | 2 +-
docs/getting_started/quickstart.rst | 2 +-
docs/index.rst | 6 ++---
docs/pipeline_examples/graph.rst | 2 +-
docs/pipeline_examples/image.rst | 2 +-
docs/pipeline_examples/multi_table.rst | 2 +-
docs/pipeline_examples/text.rst | 6 ++---
examples/README.md | 6 ++---
.... Setting MLPipeline Hyperparameters.ipynb | 2 +-
examples/tutorials/7. Tuning a Pipeline.ipynb | 2 +-
...or the best pipeline with BTBSession.ipynb | 2 +-
mlblocks/__init__.py | 2 +-
setup.py | 2 +-
20 files changed, 56 insertions(+), 56 deletions(-)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4c01093e..43acf3a0 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -15,7 +15,7 @@ Types of Contributions
Report Bugs
~~~~~~~~~~~
-Report bugs at https://github.com/HDI-Project/MLBlocks/issues.
+Report bugs at https://github.com/MLBazaar/MLBlocks/issues.
If you are reporting a bug, please include:
@@ -45,7 +45,7 @@ articles, and such.
Submit Feedback
~~~~~~~~~~~~~~~
-The best way to send feedback is to file an issue at https://github.com/HDI-Project/MLBlocks/issues.
+The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues.
If you are proposing a feature:
@@ -120,8 +120,8 @@ Before you submit a pull request, check that it meets these guidelines:
4. If the pull request adds functionality, the docs should be updated. Put
your new functionality into a function with a docstring, and add the
feature to the list in README.rst.
-5. The pull request should work for Python2.7, 3.4, 3.5 and 3.6. Check
- https://travis-ci.org/HDI-Project/MLBlocks/pull_requests
+5. The pull request should work for all the supported Python versions. Check
+ https://travis-ci.org/MLBazaar/MLBlocks/pull_requests
and make sure that all the checks pass.
Unit Testing Guidelines
diff --git a/HISTORY.md b/HISTORY.md
index 5b5d4f0b..17bbda92 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,31 +4,31 @@ Changelog
0.3.4 - 2019-11-01
------------------
-* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala
-* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala
+* Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala
+* Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala
0.3.3 - 2019-09-09
------------------
-* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala
+* Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala
0.3.2 - 2019-08-12
------------------
-* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala
-* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala
-* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala
-* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala
-* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala
+* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala
+* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala
+* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala
+* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala
+* Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala
0.3.1 - Pipelines Discovery
---------------------------
-* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala
-* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala
-* Implement partial re-fit -[Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala
-* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala
-* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala
+* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala
+* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala
+* Implement partial re-fit - [Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala
+* Move argument parsing to MLBlock - [Issue #86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala
+* Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala
0.3.0 - New Primitives Discovery
--------------------------------
diff --git a/README.md b/README.md
index 127089ac..770f34ef 100644
--- a/README.md
+++ b/README.md
@@ -13,18 +13,18 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
[](https://pypi.python.org/pypi/mlblocks)
-[](https://travis-ci.org/HDI-Project/MLBlocks)
-[](https://codecov.io/gh/HDI-Project/MLBlocks)
+[](https://travis-ci.org/MLBazaar/MLBlocks)
+[](https://codecov.io/gh/MLBazaar/MLBlocks)
[](https://pepy.tech/project/mlblocks)
# MLBlocks
-* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
+* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE)
* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
-* Documentation: https://HDI-Project.github.io/MLBlocks
-* Homepage: https://github.com/HDI-Project/MLBlocks
+* Documentation: https://mlbazaar.github.io/MLBlocks
+* Homepage: https://github.com/MLBazaar/MLBlocks
## Overview
@@ -38,7 +38,7 @@ Features include:
no python code to write, carefully curated by Machine Learning and Domain experts.
* Extract machine-readable information about which hyperparameters can be tuned and within
which ranges, allowing automated integration with Hyperparameter Optimization tools like
- [BTB](https://github.com/HDI-Project/BTB).
+ [BTB](https://github.com/MLBazaar/BTB).
* Complex multi-branch pipelines and DAG configurations, with unlimited number of inputs and
outputs per primitive.
* Easy save and load Pipelines using JSON Annotations.
@@ -65,14 +65,14 @@ pip install mlblocks
This will pull and install the latest stable release from [PyPi](https://pypi.org/).
If you want to install from source or contribute to the project please read the
-[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started).
+[Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started).
## MLPrimitives
In order to be usable, MLBlocks requires a compatible primitives library.
The official library, required in order to follow the MLBlocks tutorial below,
-is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), which you can install
+is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install
with this command:
```bash
@@ -83,7 +83,7 @@ pip install mlprimitives
Below is a short example of how to use **MLBlocks** to solve the [Adult Census
Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
-pipeline which combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives),
+pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives),
[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/).
```python3
@@ -112,10 +112,10 @@ dataset.score(y_test, predictions)
If you want to learn more about how to tune the pipeline hyperparameters, save and load
the pipelines using JSON annotations or build complex multi-branched pipelines, please
-check our [documentation site](https://HDI-Project.github.io/MLBlocks).
+check our [documentation site](https://mlbazaar.github.io/MLBlocks).
Also do not forget to have a look at the [notebook tutorials](
-https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)!
+https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)!
# Citing MLBlocks
diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst
index 9d358629..5ad0b60b 100644
--- a/docs/advanced_usage/adding_primitives.rst
+++ b/docs/advanced_usage/adding_primitives.rst
@@ -17,8 +17,8 @@ This can be achieved by running the commands::
For further details, please refer to the `MLPrimitives Documentation`_.
-.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
-.. _MLPrimitives Documentation: https://hdi-project.github.io/MLPrimitives/
+.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
+.. _MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/
Writing Primitives
------------------
@@ -27,7 +27,7 @@ Sometimes you will find that you want to use a primitive that is not in the list
`MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself
by writing the corresponding `JSON annotation `_.
-.. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives
+.. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives
.. note:: If you create new primitives for MLBlocks, please consider contributing them to the
**MLPrimitives** project!
diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst
index 71686ac5..488be9a9 100644
--- a/docs/advanced_usage/hyperparameters.rst
+++ b/docs/advanced_usage/hyperparameters.rst
@@ -221,8 +221,8 @@ In this case, the hyperparameters would be annotated like this::
of type, range and default value as a nested dictionary to be used by default.
.. _JSON Annotations: primitives.html#json-annotations
-.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
-.. _BTB: https://github.com/HDI-Project/BTB
+.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
+.. _BTB: https://github.com/MLBazaar/BTB
.. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline
.. _multitype: #multitype-hyperparameters
.. _conditional: #conditional-hyperparameters
diff --git a/docs/advanced_usage/primitives.rst b/docs/advanced_usage/primitives.rst
index 58847bbe..37df9031 100644
--- a/docs/advanced_usage/primitives.rst
+++ b/docs/advanced_usage/primitives.rst
@@ -311,11 +311,11 @@ For a more detailed description of this class, please check the corresponding
section in the `API Reference`_ documentation.
.. _API Reference: ../api_reference.html
-.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
-.. _keras.preprocessing.text.Tokenizer: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json
+.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
+.. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json
.. _hyperparameters: hyperparameters.html
.. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock
.. _pipelines: pipelines.html
-.. _examples folder: https://github.com/HDI-Project/MLBlocks/tree/master/examples
+.. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples
.. _fit: ../api_reference.html#mlblocks.MLBlock.fit
.. _produce: ../api_reference.html#mlblocks.MLBlock.produce
diff --git a/docs/conf.py b/docs/conf.py
index 5ff266d0..f81b7b7e 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -63,7 +63,7 @@
copyright = '2018, MIT Data To AI Lab'
author = 'MIT Data To AI Lab'
description = 'Pipelines and Primitives for Machine Learning and Data Science.'
-user = 'HDI-Project'
+user = 'MLBazaar'
# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst
index d2bda921..d64970a2 100644
--- a/docs/getting_started/install.rst
+++ b/docs/getting_started/install.rst
@@ -30,7 +30,7 @@ is `MLPrimitives`_, which you can install with this command:
pip install mlprimitives
-.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives
+.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
Install for development
-----------------------
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index b55223dd..386752dc 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -123,5 +123,5 @@ to obtain predictions from the pipeline.
.. _hyperparameters: ../advanced_usage/hyperparameters.html
.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations
.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters
-.. _BTB: https://github.com/HDI-Project/BTB
+.. _BTB: https://github.com/MLBazaar/BTB
.. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters
diff --git a/docs/index.rst b/docs/index.rst
index e891230c..85717469 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,9 +6,9 @@ What is MLBlocks?
:alt: MLBlocks
:align: center
-* Free software: `MIT license <https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE>`_
-* Documentation: https://HDI-Project.github.io/MLBlocks
-* Homepage: https://github.com/HDI-Project/MLBlocks
+* Free software: `MIT license <https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE>`_
+* Documentation: https://mlbazaar.github.io/MLBlocks
+* Homepage: https://github.com/MLBazaar/MLBlocks
MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
tools developed in Python, whether they are custom developments or belong to third party
diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst
index 54ef85a1..8cde5340 100644
--- a/docs/pipeline_examples/graph.rst
+++ b/docs/pipeline_examples/graph.rst
@@ -69,6 +69,6 @@ additional information not found inside `X`.
.. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html
-.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json
+.. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json
.. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
.. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst
index e8274761..b9b97ef7 100644
--- a/docs/pipeline_examples/image.rst
+++ b/docs/pipeline_examples/image.rst
@@ -136,7 +136,7 @@ to an `XGBRegressor`_ primitive.
.. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/
.. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur
-.. _MLPrimitives primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json
+.. _MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json
.. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog
.. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
.. _Pretrained Networks from Keras: https://keras.io/applications/
diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst
index 109f4015..c2c2066f 100644
--- a/docs/pipeline_examples/multi_table.rst
+++ b/docs/pipeline_examples/multi_table.rst
@@ -49,5 +49,5 @@ tables are.
.. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/
.. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
-.. _DeepFeatureSynthesis: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json
+.. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json
.. _featuretools: https://www.featuretools.com/
diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst
index 03472ea3..ee0c16ac 100644
--- a/docs/pipeline_examples/text.rst
+++ b/docs/pipeline_examples/text.rst
@@ -140,9 +140,9 @@ to encode all the string features, and go directly into the
.. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
-.. _TextCleaner primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/text.py
-.. _StringVectorizer primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/feature_extraction.py
+.. _TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py
+.. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py
.. _keras text preprocessing: https://keras.io/preprocessing/text/
-.. _Keras LSTM Classifier from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json
+.. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json
.. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus
.. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
diff --git a/examples/README.md b/examples/README.md
index d295414e..de298ef2 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -26,7 +26,7 @@ In order to run these tutorials on your computer, please follow these steps:
1. Clone this github repository:
```bash
-git clone git@github.com:HDI-Project/MLBlocks.git
+git clone git@github.com:MLBazaar/MLBlocks.git
```
2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the
@@ -45,8 +45,8 @@ cd MLBlocks
make install-examples
```
-This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives](
-https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
+This will install [MLBlocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives](
+https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
4. Enter the `examples` folder and start a Jupyter Notebook:
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 5b7944b5..4993fd4e 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -122,7 +122,7 @@
"\n",
"**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n",
"\n",
- "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n",
+ "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n",
"that work with flat, one-level, dictionaries, the argument `flat=True` can be passed."
]
},
diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
index 4b6eae24..ca30df17 100644
--- a/examples/tutorials/7. Tuning a Pipeline.ipynb
+++ b/examples/tutorials/7. Tuning a Pipeline.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Tuning a Pipeline\n",
"\n",
- "This short guide shows how tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n",
+ "This short guide shows how tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n",
"\n",
"Note that some steps are not explained for simplicity. Full details\n",
"about them can be found in the previous parts of the tutorial.\n",
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index 1fb4d7ca..829a38d6 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -7,7 +7,7 @@
"# Selecting and Tuning pipelines\n",
"\n",
"This guide shows you how to search for multiple pipelines for your problem\n",
- "and later on use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
+ "and later on use a [BTBSession](https://mlbazaar.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
"to select and tune the best one.\n",
"\n",
"Note that some steps are not explained for simplicity. Full details\n",
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 618e7a55..300b9093 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -7,7 +7,7 @@
seamlessly combining tools from any python library with a simple, common and uniform interface.
* Free software: MIT license
-* Documentation: https://HDI-Project.github.io/MLBlocks
+* Documentation: https://MLBazaar.github.io/MLBlocks
"""
from mlblocks.discovery import (
diff --git a/setup.py b/setup.py
index 6ae0c75e..0c67cc8d 100644
--- a/setup.py
+++ b/setup.py
@@ -112,7 +112,7 @@
setup_requires=setup_requires,
test_suite='tests',
tests_require=tests_require,
- url='https://github.com/HDI-Project/MLBlocks',
+ url='https://github.com/MLBazaar/MLBlocks',
version='0.3.5.dev0',
zip_safe=False,
)
From b9a6142e77b50eae9ae1a3aad6eae8dc1e1f6e70 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 13 Dec 2020 18:46:15 +0100
Subject: [PATCH 122/160] Prevent travis-ci conflict
---
setup.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 0c67cc8d..4ff3a675 100644
--- a/setup.py
+++ b/setup.py
@@ -77,7 +77,10 @@
# Documentation style
'doc8>=0.8.0',
- 'pydocstyle>=3.0.0'
+ 'pydocstyle>=3.0.0',
+
+ # Prevent travis-ci conflict
+ 'chardet<4',
]
From 52653e072a17986da77c666fc5f2a73895f4b40b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 14 Dec 2020 14:03:35 +0100
Subject: [PATCH 123/160] Update Travis badge
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 770f34ef..103fc113 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
[](https://pypi.python.org/pypi/mlblocks)
-[](https://travis-ci.org/MLBazaar/MLBlocks)
+[](https://travis-ci.com/MLBazaar/MLBlocks)
[](https://codecov.io/gh/MLBazaar/MLBlocks)
[](https://pepy.tech/project/mlblocks)
From c5f3fdfc3de21fffe0053c00fd7d6279243126a9 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
<41479552+pvk-developer@users.noreply.github.com>
Date: Tue, 22 Dec 2020 15:23:57 +0100
Subject: [PATCH 124/160] Add memory debug and profile (#130)
* Add code for memory consumption tracking and optionally select which debug information to collect.
* Add documentation about debugging
* Add psutil
* Tests updates
* Fix lint
* Add extra tests
* Update MLPrimitives version
* Rephrase documentation
---
docs/advanced_usage/pipelines.rst | 36 ++++++
mlblocks/mlpipeline.py | 189 ++++++++++++++++++------------
setup.py | 5 +-
tests/test_mlpipeline.py | 184 ++++++++++++++++++++++-------
4 files changed, 294 insertions(+), 120 deletions(-)
diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index e87a0067..07b36c98 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -423,6 +423,42 @@ An example of this situation, where we want to reuse the output of the first blo
predictions = pipeline.predict(X_test)
score = compute_score(y_test, predictions)
+Pipeline debugging
+------------------
+
+Sometimes we might be interested in debugging a pipeline execution and obtaining information
+about the time, the memory usage, and the inputs and outputs of each step. This is possible
+by using the ``debug`` argument with the ``fit`` and ``predict`` methods. This argument allows
+us to retrieve critical information from the pipeline execution:
+
+* ``Time``: Elapsed time for the primitive and the given stage (fit or predict).
+* ``Memory``: Amount of memory increase (or decrease) caused by the given primitive.
+* ``Input``: The input values that the primitive takes for that specific step.
+* ``Output``: The output produced by the primitive.
+
+
+If the ``debug`` argument is set to ``True``, a dictionary containing all the
+elements listed above will be returned::
+
+ result, debug_info = pipeline.fit(X_train, y_train, debug=True)
+
+If you want to retrieve only some of the elements listed above and skip the rest, you can
+pass a ``str`` to the ``debug`` argument with any combination of the following characters:
+
+* ``i``: To include inputs.
+* ``o``: To include outputs.
+* ``m``: To include used memory.
+* ``t``: To include elapsed time.
+
+For example, if we are only interested in capturing the elapsed time and used memory during the
+``fit`` process, we can call the method as follows::
+
+ result, debug_info = pipeline.fit(X_train, y_train, debug='tm')
+
+.. warning:: Bear in mind that using ``debug=True``, or capturing the ``Input`` and ``Output``,
+    consumes extra memory (RAM), as copies of the input and output data are created for
+    each primitive. For profiling, it is recommended to use the option ``tm`` as shown
+    in the previous example.
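+
+For instance, once the ``fit`` call above returns, the collected records can be inspected with
+a few lines of plain Python (a minimal sketch, based on the record structure described above)::
+
+    for block_name, record in debug_info['fit'].items():
+        print(block_name, record['time'], record['memory'])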
.. _API Reference: ../api_reference.html
.. _primitives: ../primitives.html
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 6e0744bd..a4111bcb 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,6 +4,7 @@
import json
import logging
+import os
import re
import warnings
from collections import Counter, OrderedDict, defaultdict
@@ -11,6 +12,7 @@
from datetime import datetime
import numpy as np
+import psutil
from graphviz import Digraph
from mlblocks.discovery import load_pipeline
@@ -110,14 +112,14 @@ def _build_blocks(self):
if not block_params:
block_params = self.init_params.get(primitive_name, dict())
if block_params and block_count > 1:
- LOGGER.warning(("Non-numbered init_params are being used "
- "for more than one block %s."), primitive_name)
+ LOGGER.warning(('Non-numbered init_params are being used '
+ 'for more than one block %s.'), primitive_name)
block = MLBlock(primitive, **block_params)
blocks[block_name] = block
except Exception:
- LOGGER.exception("Exception caught building MLBlock %s", primitive)
+ LOGGER.exception('Exception caught building MLBlock %s', primitive)
raise
return blocks
@@ -475,8 +477,8 @@ def _sanitize(cls, hyperparameters):
is a dict containing a complete hyperparameter specification for that block::
{
- "block_name": {
- "hyperparameter_name": "hyperparameter_value",
+ 'block_name': {
+ 'hyperparameter_name': 'hyperparameter_value',
...
},
...
@@ -487,7 +489,7 @@ def _sanitize(cls, hyperparameters):
second one::
{
- ("block_name", "hyperparameter_name"): "hyperparameter_value",
+ ('block_name', 'hyperparameter_name'): 'hyperparameter_value',
...
}
@@ -611,39 +613,52 @@ def _update_outputs(self, variable_name, output_variables, outputs, value):
index = output_variables.index(variable_name)
outputs[index] = deepcopy(value)
- def _fit_block(self, block, block_name, context, debug=None):
+ def _fit_block(self, block, block_name, context, debug_info=None):
"""Get the block args from the context and fit the block."""
- LOGGER.debug("Fitting block %s", block_name)
+ LOGGER.debug('Fitting block %s', block_name)
try:
fit_args = self._get_block_args(block_name, block.fit_args, context)
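+            # Sample the process RSS before and after fit to report the block's memory delta.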
+ process = psutil.Process(os.getpid())
+ memory_before = process.memory_info().rss
start = datetime.utcnow()
block.fit(**fit_args)
elapsed = datetime.utcnow() - start
+ memory_after = process.memory_info().rss
- if debug is not None:
- debug["fit"][block_name] = {
- "elapsed": elapsed.total_seconds(),
- "input": fit_args
- }
+ if debug_info is not None:
+ debug = debug_info['debug']
+ record = {}
+ if 't' in debug:
+ record['time'] = elapsed.total_seconds()
+ if 'm' in debug:
+ record['memory'] = memory_after - memory_before
+ if 'i' in debug:
+ record['input'] = deepcopy(fit_args)
+
+ debug_info['fit'][block_name] = record
except Exception:
if self.verbose:
- LOGGER.exception("Exception caught fitting MLBlock %s", block_name)
+ LOGGER.exception('Exception caught fitting MLBlock %s', block_name)
raise
- def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None):
+ def _produce_block(self, block, block_name, context, output_variables,
+ outputs, debug_info=None):
"""Get the block args from the context and produce the block.
Afterwards, set the block outputs back into the context and update
the outputs list if necessary.
"""
- LOGGER.debug("Producing block %s", block_name)
+ LOGGER.debug('Producing block %s', block_name)
try:
produce_args = self._get_block_args(block_name, block.produce_args, context)
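+            # As in _fit_block, sample RSS around the produce call to estimate memory usage.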
+ process = psutil.Process(os.getpid())
+ memory_before = process.memory_info().rss
start = datetime.utcnow()
block_outputs = block.produce(**produce_args)
elapsed = datetime.utcnow() - start
+ memory_after = process.memory_info().rss
outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output)
context.update(outputs_dict)
@@ -656,21 +671,23 @@ def _produce_block(self, block, block_name, context, output_variables, outputs,
variable_name = '{}.{}'.format(block_name, key)
self._update_outputs(variable_name, output_variables, outputs, value)
- if debug is not None:
- record = {
- "elapsed": elapsed.total_seconds(),
- "input": produce_args,
- "output": outputs_dict
- }
+ if debug_info is not None:
+ debug = debug_info['debug']
+ record = {}
+ if 't' in debug:
+ record['time'] = elapsed.total_seconds()
+ if 'm' in debug:
+ record['memory'] = memory_after - memory_before
+ if 'i' in debug:
+ record['input'] = deepcopy(produce_args)
+ if 'o' in debug:
+ record['output'] = deepcopy(outputs_dict)
- if "fit" in debug.keys():
- debug["produce"][block_name] = record
- else:
- debug[block_name] = record
+ debug_info['produce'][block_name] = record
except Exception:
if self.verbose:
- LOGGER.exception("Exception caught producing MLBlock %s", block_name)
+ LOGGER.exception('Exception caught producing MLBlock %s', block_name)
raise
@@ -692,21 +709,31 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
y:
Fit Data labels, which the pipeline will use to learn how to
behave.
-
output_ (str or int or list or None):
Output specification, as required by ``get_outputs``. If ``None`` is given,
nothing will be returned.
-
start_ (str or int or None):
Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
-
- debug (boolean):
- Debug mode, if True a dictionary containing the block names as keys and
- the execution time in seconds, input, output as values is returned.
+ debug (bool or str):
+ Debug a pipeline with the following options:
+
+ * ``t``:
+ Elapsed time for the primitive and the given stage (fit or predict).
+ * ``m``:
+                    Amount of memory increase (or decrease) for the primitive. This amount
+ is represented in bytes.
+ * ``i``:
+ The input values that the primitive takes for that step.
+ * ``o``:
+ The output values that the primitive generates.
+
+                If given, a dictionary with the collected debug information is returned.
+                This argument can be a string containing any combination of the letters
+                listed above, or ``True``, which returns the complete debug information.
**kwargs:
Any additional keyword arguments will be directly added
@@ -738,13 +765,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
debug_info = None
if debug:
debug_info = defaultdict(dict)
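+            # True enables all four flags; a string such as 'tm' selects a subset.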
+ debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio'
for block_name, block in self.blocks.items():
if start_:
if block_name == start_:
start_ = False
else:
- LOGGER.debug("Skipping block %s fit", block_name)
+ LOGGER.debug('Skipping block %s fit', block_name)
continue
self._fit_block(block, block_name, context, debug_info)
@@ -770,13 +798,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
return result
- if debug:
- return debug_info
-
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
+ if debug:
+ return debug_info
+
def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs):
"""Produce predictions using the blocks of this pipeline.
@@ -791,21 +819,31 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
Args:
X:
Data which the pipeline will use to make predictions.
-
output_ (str or int or list or None):
Output specification, as required by ``get_outputs``. If not specified
the ``default`` output will be returned.
-
start_ (str or int or None):
Block index or block name to start processing from. The
value can either be an integer, which will be interpreted as a block index,
or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
and all the blocks before that one will be skipped.
-
- debug (boolean):
- Debug mode, if True a dictionary containing the block names as keys and
- the execution time in seconds, input, output as values is returned.
+ debug (bool or str):
+ Debug a pipeline with the following options:
+
+ * ``t``:
+ Elapsed time for the primitive and the given stage (fit or predict).
+ * ``m``:
+                    Amount of memory increase (or decrease) for the primitive. This amount
+ is represented in bytes.
+ * ``i``:
+ The input values that the primitive takes for that step.
+ * ``o``:
+ The output values that the primitive generates.
+
+                If ``True``, a dictionary containing all the elements listed above will be
+                returned. If a string with a combination of the letters listed above is
+                given, the returned dictionary will contain only the selected elements.
**kwargs:
Any additional keyword arguments will be directly added
@@ -815,6 +853,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
object or tuple:
* If a single output is requested, it is returned alone.
* If multiple outputs have been requested, a tuple is returned.
+                * If ``debug`` is given, a tuple will be returned where the first element
+                  is the predictions and the second is a dictionary containing the debug
+                  information.
"""
context = kwargs.copy()
if X is not None:
@@ -827,14 +868,15 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
debug_info = None
if debug:
- debug_info = dict()
+ debug_info = defaultdict(dict)
+ debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio'
for block_name, block in self.blocks.items():
if start_:
if block_name == start_:
start_ = False
else:
- LOGGER.debug("Skipping block %s produce", block_name)
+ LOGGER.debug('Skipping block %s produce', block_name)
continue
self._produce_block(block, block_name, context, output_variables, outputs, debug_info)
@@ -856,9 +898,6 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
return result
- if debug:
- return debug_info
-
if start_:
# We skipped all the blocks up to the end
raise ValueError('Unknown block name: {}'.format(start_))
@@ -871,32 +910,32 @@ def to_dict(self):
specification of the tunable_hyperparameters::
{
- "primitives": [
- "a_primitive",
- "another_primitive"
+ 'primitives': [
+ 'a_primitive',
+ 'another_primitive'
],
- "init_params": {
- "a_primitive": {
- "an_argument": "a_value"
+ 'init_params': {
+ 'a_primitive': {
+ 'an_argument': 'a_value'
}
},
- "hyperparameters": {
- "a_primitive#1": {
- "an_argument": "a_value",
- "another_argument": "another_value",
+ 'hyperparameters': {
+ 'a_primitive#1': {
+ 'an_argument': 'a_value',
+ 'another_argument': 'another_value',
},
- "another_primitive#1": {
- "yet_another_argument": "yet_another_value"
+ 'another_primitive#1': {
+ 'yet_another_argument': 'yet_another_value'
}
},
- "tunable_hyperparameters": {
- "another_primitive#1": {
- "yet_another_argument": {
- "type": "str",
- "default": "a_default_value",
- "values": [
- "a_default_value",
- "yet_another_value"
+ 'tunable_hyperparameters': {
+ 'another_primitive#1': {
+ 'yet_another_argument': {
+ 'type': 'str',
+ 'default': 'a_default_value',
+ 'values': [
+ 'a_default_value',
+ 'yet_another_value'
]
}
}
@@ -926,8 +965,8 @@ def _get_simple_block_name(self, block_name):
str:
block name stripped of number and other modifiers.
"""
- full_name = block_name.split("#")[0]
- simple_name = full_name.split(".")[-1]
+ full_name = block_name.split('#')[0]
+ simple_name = full_name.split('.')[-1]
return simple_name
def _get_context_name_from_variable(self, variable_name):
@@ -942,12 +981,12 @@ def _get_context_name_from_variable(self, variable_name):
str:
Name of the context of the variable.
"""
- block_name = variable_name.split("#")[0]
+ block_name = variable_name.split('#')[0]
rest = variable_name[len(block_name) + 1:]
- block_index = rest.split(".")[0]
+ block_index = rest.split('.')[0]
context_name = rest[len(block_index) + 1:]
if len(context_name) == 0:
- raise ValueError("Invalid variable name")
+ raise ValueError('Invalid variable name')
return context_name
def _get_relevant_output_variables(self, block_name, block, current_output_variables):
@@ -1107,7 +1146,7 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks):
Dictionary of input variables of the pipeline and the set of tuples of blocks into
which the variable connects and the type of arrowhead to use
"""
- with diagram.subgraph(name="cluster_inputs") as cluster:
+ with diagram.subgraph(name='cluster_inputs') as cluster:
cluster.attr(tooltip='Input variables')
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='20')
@@ -1148,7 +1187,7 @@ def _make_diagram_outputs(self, diagram, outputs):
output_variables = []
outputs_vars = self.get_outputs(outputs)
- with diagram.subgraph(name="cluster_outputs") as cluster:
+ with diagram.subgraph(name='cluster_outputs') as cluster:
cluster.attr(tooltip='Output variables')
cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0')
cluster.attr('node', penwidth='0', fontsize='20')
diff --git a/setup.py b/setup.py
index 4ff3a675..d76236ae 100644
--- a/setup.py
+++ b/setup.py
@@ -17,12 +17,13 @@
install_requires = [
'graphviz>=0.9,<1',
'numpy>=1.17.1,<1.19',
+ 'psutil>=5,<6',
]
examples_require = [
'matplotlib>=2.2.2,<3.2.2',
- 'mlprimitives>=0.2.6.dev0,<0.3',
+ 'mlprimitives>=0.3.0.dev0,<0.4',
'boto3>=1.14,<1.14.45',
'botocore<1.17.45,>=1.17.44',
'jupyter==1.0.0',
@@ -34,7 +35,7 @@
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.2.6.dev0,<0.3',
+ 'mlprimitives>=0.3.0.dev0,<0.4',
'setuptools>=41.0.0',
'rundoc>=0.4.3',
'prompt-toolkit>=2.0,<3.0',
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 59e11633..97c59cd0 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -696,7 +696,7 @@ def test_fit_no_debug(self):
assert returned is None
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
- def test_fit_debug(self):
+ def test_fit_debug_bool(self):
mlpipeline = MLPipeline(['a_primitive'])
mlpipeline.blocks['a_primitive#1'].fit_args = [
{
@@ -706,24 +706,53 @@ def test_fit_debug(self):
]
expected_return = dict()
- expected_return["fit"] = {
- "a_primitive#1": {
- "elapsed": 0,
- "input": {
- "whatever"
- }
+ expected_return['debug'] = 'tmio'
+ expected_return['fit'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'input': {
+ 'whatever'
+ },
+ 'memory': 0,
}
}
returned = mlpipeline.fit(debug=True)
- print(returned)
assert isinstance(returned, dict)
assert set(returned.keys()) == set(expected_return.keys()) # fit / produce
- assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name
+ assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name
+
+ for block_name, dictionary in expected_return['fit'].items():
+ assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_debug_str(self):
+ mlpipeline = MLPipeline(['a_primitive'])
+ mlpipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ expected_return = dict()
+ expected_return['debug'] = 'tm'
+ expected_return['fit'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'memory': 0,
+ }
+ }
+
+ returned = mlpipeline.fit(debug='tm')
+
+ assert isinstance(returned, dict)
+ assert set(returned.keys()) == set(expected_return.keys()) # fit / produce
+ assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name
- for block_name, dictionary in expected_return["fit"].items():
- assert set(returned["fit"][block_name].keys()) == set(dictionary.keys())
+ for block_name, dictionary in expected_return['fit'].items():
+ assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_fit_produce_debug(self):
@@ -759,39 +788,104 @@ def test_fit_produce_debug(self):
]
expected_return = dict()
- expected_return["fit"] = {
- "a_primitive#1": {
- "elapsed": 0,
- "input": {
- "whatever"
- }
+ expected_return['debug'] = 'tmio'
+ expected_return['fit'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'input': {
+ 'whatever'
+ },
+ 'memory': 0,
}
}
- expected_return["produce"] = {
- "a_primitive#1": {
- "elapsed": 0,
- "input": {
- "whatever"
+ expected_return['produce'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'input': {
+ 'whatever'
},
- "output": {
- "whatever"
- }
+ 'output': {
+ 'whatever'
+ },
+ 'memory': 0,
}
}
returned, debug_returned = mlpipeline.fit(output_='default', debug=True)
- assert len([returned]) == len(outputs["default"])
+ assert len([returned]) == len(outputs['default'])
+ assert isinstance(debug_returned, dict)
+ assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce
+ assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
+ assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())
+
+ for block_name, dictionary in expected_return['fit'].items():
+ assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())
+
+ for block_name, dictionary in expected_return['produce'].items():
+ assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_produce_debug_str(self):
+ outputs = {
+ 'default': [
+ {
+ 'name': 'a_name',
+ 'variable': 'a_primitive#1.a_variable',
+ 'type': 'a_type',
+ }
+ ]
+ }
+ mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
+ mlpipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+
+ mlpipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'a_name',
+ 'type': 'a_type'
+ }
+ ]
+
+ expected_return = dict()
+ expected_return['debug'] = 'tm'
+ expected_return['fit'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'memory': 0,
+ }
+ }
+ expected_return['produce'] = {
+ 'a_primitive#1': {
+ 'time': 0,
+ 'memory': 0,
+ }
+ }
+
+ returned, debug_returned = mlpipeline.fit(output_='default', debug='tm')
+
+ assert len([returned]) == len(outputs['default'])
assert isinstance(debug_returned, dict)
assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce
- assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys())
- assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys())
+ assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
+ assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())
- for block_name, dictionary in expected_return["fit"].items():
- assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys())
+ for block_name, dictionary in expected_return['fit'].items():
+ assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())
- for block_name, dictionary in expected_return["produce"].items():
- assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys())
+ for block_name, dictionary in expected_return['produce'].items():
+ assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_predict_no_debug(self):
@@ -829,9 +923,9 @@ def test_predict_no_debug(self):
]
returned = mlpipeline.predict(debug=False)
- assert len(returned) == len(outputs["default"])
- for returned_output, expected_output in zip(returned, outputs["default"]):
- assert returned_output == expected_output["variable"]
+ assert len(returned) == len(outputs['default'])
+ for returned_output, expected_output in zip(returned, outputs['default']):
+ assert returned_output == expected_output['variable']
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_predict_debug(self):
@@ -861,18 +955,22 @@ def test_predict_debug(self):
expected_return = dict()
expected_return = {
- "a_primitive#1": {
- "elapsed": 0,
- "input": {
- "whatever"
+ 'a_primitive#1': {
+ 'time': 0,
+ 'input': {
+ 'whatever'
},
- "output": {
- "whatever"
- }
+ 'output': {
+ 'whatever'
+ },
+ 'memory': 0
}
}
+
returned, debug_returned = mlpipeline.predict(debug=True)
- assert len([returned]) == len(outputs["default"])
+ debug_returned = debug_returned['produce']
+
+ assert len([returned]) == len(outputs['default'])
assert isinstance(debug_returned, dict)
assert set(debug_returned.keys()) == set(expected_return.keys())
From 9f9c9a14f22e7d2f52e992562a5cc189c0ed12c8 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 22 Dec 2020 16:00:33 +0100
Subject: [PATCH 125/160] =?UTF-8?q?Bump=20version:=200.3.5.dev0=20?=
=?UTF-8?q?=E2=86=92=200.4.0.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 300b9093..08618880 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.3.5.dev0'
+__version__ = '0.4.0.dev0'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 61208b1f..32db4562 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.5.dev0
+current_version = 0.4.0.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index d76236ae..a929025f 100644
--- a/setup.py
+++ b/setup.py
@@ -117,6 +117,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.3.5.dev0',
+ version='0.4.0.dev0',
zip_safe=False,
)
From 1af7b1bbc617beaab80f453eec01a145e8685032 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 22 Dec 2020 16:00:48 +0100
Subject: [PATCH 126/160] =?UTF-8?q?Bump=20version:=200.4.0.dev0=20?=
=?UTF-8?q?=E2=86=92=200.4.0.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 08618880..e3d6fada 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.0.dev0'
+__version__ = '0.4.0.dev1'
__all__ = [
'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 32db4562..17998d88 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.0.dev0
+current_version = 0.4.0.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index a929025f..0eab74aa 100644
--- a/setup.py
+++ b/setup.py
@@ -117,6 +117,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.0.dev0',
+ version='0.4.0.dev1',
zip_safe=False,
)
From 84460489fc0a0fb2c762f3f16baf4c3e09d5056a Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com>
Date: Fri, 8 Jan 2021 10:16:22 -0500
Subject: [PATCH 127/160] Stop fitting pipeline after last fit block (#132)
* initial early stop
* change to stop after fitting the last block with attribute
* test early-stop calls
* remove comment
* change to fit pending
---
mlblocks/mlpipeline.py | 35 ++++++++++++++++++++---------
tests/test_mlpipeline.py | 48 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 10 deletions(-)
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index a4111bcb..d7935757 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -96,6 +96,7 @@ def _get_tunable_hyperparameters(self):
def _build_blocks(self):
blocks = OrderedDict()
+ last_fit_block = None
block_names_count = Counter()
for primitive in self.primitives:
@@ -118,11 +119,14 @@ def _build_blocks(self):
block = MLBlock(primitive, **block_params)
blocks[block_name] = block
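+            # Keep track of the last block that actually needs fitting, so fit() can stop early.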
+ if bool(block._fit):
+ last_fit_block = block_name
+
except Exception:
LOGGER.exception('Exception caught building MLBlock %s', primitive)
raise
- return blocks
+ return blocks, last_fit_block
@staticmethod
def _get_pipeline_dict(pipeline, primitives):
@@ -207,7 +211,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None,
self.primitives = primitives or pipeline['primitives']
self.init_params = init_params or pipeline.get('init_params', dict())
- self.blocks = self._build_blocks()
+ self.blocks, self._last_fit_block = self._build_blocks()
self._last_block_name = self._get_block_name(-1)
self.input_names = input_names or pipeline.get('input_names', dict())
@@ -767,7 +771,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
debug_info = defaultdict(dict)
debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio'
+ fit_pending = True
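+        # fit_pending turns False once the last block that needs fitting is reached;
+        # from then on, blocks are only produced if requested outputs are still pending.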
for block_name, block in self.blocks.items():
+ if block_name == self._last_fit_block:
+ fit_pending = False
+
if start_:
if block_name == start_:
start_ = False
@@ -777,7 +785,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
self._fit_block(block, block_name, context, debug_info)
- if (block_name != self._last_block_name) or (block_name in output_blocks):
+ if fit_pending or output_blocks:
self._produce_block(
block, block_name, context, output_variables, outputs, debug_info)
@@ -787,16 +795,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs):
# If there was an output_ but there are no pending
# outputs we are done.
- if output_variables is not None and not output_blocks:
- if len(outputs) > 1:
- result = tuple(outputs)
- else:
- result = outputs[0]
+ if output_variables:
+ if not output_blocks:
+ if len(outputs) > 1:
+ result = tuple(outputs)
+ else:
+ result = outputs[0]
+
+ if debug:
+ return result, debug_info
+
+ return result
+ elif not fit_pending:
if debug:
- return result, debug_info
+ return debug_info
- return result
+ return
if start_:
# We skipped all the blocks up to the end
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 97c59cd0..0ee4cf2c 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -681,6 +681,54 @@ def test_get_inputs_no_fit(self):
assert inputs == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_pending_all_primitives(self):
+ block_1 = get_mlblock_mock()
+ block_2 = get_mlblock_mock()
+ blocks = OrderedDict((
+ ('a.primitive.Name#1', block_1),
+ ('a.primitive.Name#2', block_2),
+ ))
+
+ self_ = MagicMock(autospec=MLPipeline)
+ self_.blocks = blocks
+ self_._last_fit_block = 'a.primitive.Name#2'
+
+ MLPipeline.fit(self_)
+
+ expected = [
+ call('a.primitive.Name#1'),
+ call('a.primitive.Name#2')
+ ]
+ self_._fit_block.call_args_list = expected
+
+ expected = [
+ call('a.primitive.Name#1'),
+ ]
+ self_._produce_block.call_args_list = expected
+
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_fit_pending_one_primitive(self):
+ block_1 = get_mlblock_mock()
+ block_2 = get_mlblock_mock()
+ blocks = OrderedDict((
+ ('a.primitive.Name#1', block_1),
+ ('a.primitive.Name#2', block_2),
+ ))
+
+ self_ = MagicMock(autospec=MLPipeline)
+ self_.blocks = blocks
+ self_._last_fit_block = 'a.primitive.Name#1'
+
+ MLPipeline.fit(self_)
+
+ expected = [
+ call('a.primitive.Name#1'),
+ ]
+ self_._fit_block.call_args_list = expected
+
+ assert not self_._produce_block.called
+
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_fit_no_debug(self):
mlpipeline = MLPipeline(['a_primitive'])
From 4c2a473c505524e10a850952961a66f35fa41b95 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sat, 9 Jan 2021 16:49:06 +0100
Subject: [PATCH 128/160] Migrate to gh actions (#133)
* Cleanup dependencies and migrate to gh-actions
* add mlprimitives extra
---
.github/workflows/docs.yml | 29 +++++++
.github/workflows/tests.yml | 79 +++++++++++++++++++
.travis.yml | 30 -------
Makefile | 45 +++++++++--
README.md | 21 +++--
apt.txt | 3 +
docs/api/mlblocks.datasets.rst | 5 --
docs/api/mlblocks.discovery.rst | 5 --
docs/index.rst | 4 +-
docs/pipeline_examples/graph.rst | 2 +-
docs/pipeline_examples/image.rst | 6 +-
docs/pipeline_examples/multi_table.rst | 2 +-
docs/pipeline_examples/single_table.rst | 6 +-
docs/pipeline_examples/text.rst | 4 +-
...or the best pipeline with BTBSession.ipynb | 2 +-
mlblocks/__init__.py | 16 +++-
requirements.txt | 2 +
setup.cfg | 1 -
setup.py | 23 +++---
tox.ini | 2 +
20 files changed, 197 insertions(+), 90 deletions(-)
create mode 100644 .github/workflows/docs.yml
create mode 100644 .github/workflows/tests.yml
delete mode 100644 .travis.yml
create mode 100644 apt.txt
delete mode 100644 docs/api/mlblocks.datasets.rst
delete mode 100644 docs/api/mlblocks.discovery.rst
create mode 100644 requirements.txt
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 00000000..7093b531
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,29 @@
+name: Generate Docs
+
+on:
+ push:
+ branches: [ stable ]
+
+jobs:
+
+ docs:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: Python
+ uses: actions/setup-python@v1
+ with:
+ python-version: 3.8
+
+ - name: Build
+ run: |
+ sudo apt-get install graphviz pandoc
+ python -m pip install --upgrade pip
+ pip install -e .[dev]
+ make docs
+ - name: Deploy
+ uses: peaceiris/actions-gh-pages@v3
+ with:
+ github_token: ${{secrets.GITHUB_TOKEN}}
+ publish_dir: docs/_build/html
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..ea2c37f5
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,79 @@
+name: Run Tests
+
+on:
+ push:
+ branches: [ '*' ]
+ pull_request:
+ branches: [ master ]
+
+jobs:
+ devel:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.8]
+ os: [ubuntu-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install package
+ run: pip install .[dev]
+ - name: make test-devel
+ run: make test-devel
+
+ readme:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8]
+ os: [ubuntu-latest, macos-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install package and dependencies
+ run: pip install rundoc .[mlprimitives]
+ - name: make test-readme
+ run: make test-readme
+
+ unit:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8]
+ os: [ubuntu-latest, macos-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install package and dependencies
+ run: pip install .[test]
+ - name: make test-unit
+ run: make test-unit
+
+ tutorials:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8]
+ os: [ubuntu-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - if: matrix.os == 'ubuntu-latest'
+ name: Install dependencies - Ubuntu
+ run: sudo apt-get install graphviz
+ - name: Install package and dependencies
+ run: pip install .[examples]
+ - name: make test-tutorials
+ run: make test-tutorials
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index d2a982f2..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Config file for automatic testing at travis-ci.org
-dist: bionic
-language: python
-python:
- - 3.8
- - 3.7
- - 3.6
-
-# Command to install dependencies
-install:
- - sudo apt-get update
- - sudo apt-get install graphviz pandoc
- - pip install -U tox-travis codecov
-
-# Command to run tests
-script: travis_wait 60 tox
-
-after_success: codecov
-
-deploy:
-
- - provider: pages
- skip-cleanup: true
- github-token: "$GITHUB_TOKEN"
- keep-history: true
- local-dir: docs/_build/html
- target-branch: gh-pages
- on:
- branch: master
- python: 3.8
diff --git a/Makefile b/Makefile
index 6cc80705..c28da455 100644
--- a/Makefile
+++ b/Makefile
@@ -84,6 +84,12 @@ install-test: clean-build clean-pyc ## install the package and test dependencies
install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development
pip install -e .[dev]
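+# Extract the install_requires list from setup.py, turning each '>=' bound into an '==' pin.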
+MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),\?$$/\1/g' | tr '>' '=')
+
+.PHONY: install-minimum
+install-minimum: ## install the minimum supported versions of the package dependencies
+ pip install $(MINIMUM)
+
# LINT TARGETS
@@ -123,7 +129,7 @@ test-readme: ## run the readme snippets
.PHONY: test-tutorials
test-tutorials: ## run the tutorial notebooks
find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \
- jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \;
+ jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null +
.PHONY: test
test: test-unit test-readme ## test everything that needs test dependencies
@@ -154,11 +160,11 @@ docs: clean-docs ## generate Sphinx HTML documentation, including API docs
$(MAKE) -C docs html
.PHONY: view-docs
-view-docs: docs ## view docs in browser
+view-docs: ## view the docs in a browser
$(BROWSER) docs/_build/html/index.html
.PHONY: serve-docs
-serve-docs: view-docs ## compile the docs watching for changes
+serve-docs: ## compile the docs watching for changes
watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs
@@ -170,12 +176,19 @@ dist: clean ## builds source and wheel package
python setup.py bdist_wheel
ls -l dist
-.PHONY: test-publish
-test-publish: dist ## package and upload a release on TestPyPI
+.PHONY: publish-confirm
+publish-confirm:
+ @echo "WARNING: This will irreversibly upload a new version to PyPI!"
+ @echo -n "Please type 'confirm' to proceed: " \
+ && read answer \
+ && [ "$${answer}" = "confirm" ]
+
+.PHONY: publish-test
+publish-test: dist publish-confirm ## package and upload a release on TestPyPI
twine upload --repository-url https://test.pypi.org/legacy/ dist/*
.PHONY: publish
-publish: dist ## package and upload a release
+publish: dist publish-confirm ## package and upload a release
twine upload dist/*
.PHONY: bumpversion-release
@@ -204,9 +217,21 @@ bumpversion-minor: ## Bump the version the next minor skipping the release
bumpversion-major: ## Bump the version the next major skipping the release
bumpversion --no-tag major
+.PHONY: bumpversion-revert
+bumpversion-revert: ## Undo a previous bumpversion-release
+ git checkout master
+ git branch -D stable
+
+CLEAN_DIR := $(shell git status --short | grep -v ??)
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l)
+.PHONY: check-clean
+check-clean: ## Check if the directory has uncommitted changes
+ifneq ($(CLEAN_DIR),)
+ $(error There are uncommitted changes)
+endif
+
.PHONY: check-master
check-master: ## Check if we are in master branch
ifneq ($(CURRENT_BRANCH),master)
@@ -220,15 +245,21 @@ ifeq ($(CHANGELOG_LINES),0)
endif
.PHONY: check-release
-check-release: check-master check-history ## Check if the release can be made
+check-release: check-clean check-master check-history ## Check if the release can be made
@echo "A new release can be made"
.PHONY: release
release: check-release bumpversion-release publish bumpversion-patch
+.PHONY: release-test
+release-test: check-release bumpversion-release-test publish-test bumpversion-revert
+
.PHONY: release-candidate
release-candidate: check-master publish bumpversion-candidate
+.PHONY: release-candidate-test
+release-candidate-test: check-clean check-master publish-test
+
.PHONY: release-minor
release-minor: check-release bumpversion-minor release
diff --git a/README.md b/README.md
index 103fc113..4da013b0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
-
-An open source project from Data to AI Lab at MIT.
+
+
+
+ An Open Source Project from the Data to AI Lab, at MIT
@@ -13,18 +15,19 @@ Pipelines and Primitives for Machine Learning and Data Science.
[](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
[](https://pypi.python.org/pypi/mlblocks)
-[](https://travis-ci.com/MLBazaar/MLBlocks)
+[](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster)
[](https://codecov.io/gh/MLBazaar/MLBlocks)
[](https://pepy.tech/project/mlblocks)
+[](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials)
# MLBlocks
-* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE)
-* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
* Documentation: https://mlbazaar.github.io/MLBlocks
-* Homepage: https://github.com/MLBazaar/MLBlocks
+* Github: https://github.com/MLBazaar/MLBlocks
+* License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE)
+* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
## Overview
@@ -49,11 +52,7 @@ Features include:
**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/)
-Also, although it is not strictly required, the usage of a
-[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
-interfering with other software installed in the system where **MLBlocks** is run.
-
-## Install with pip
+## Install with `pip`
The easiest and recommended way to install **MLBlocks** is using [pip](
https://pip.pypa.io/en/stable/):
diff --git a/apt.txt b/apt.txt
new file mode 100644
index 00000000..65387721
--- /dev/null
+++ b/apt.txt
@@ -0,0 +1,3 @@
+# apt-get requirements for development and mybinder environment
+graphviz
+pandoc
diff --git a/docs/api/mlblocks.datasets.rst b/docs/api/mlblocks.datasets.rst
deleted file mode 100644
index 6661cd8a..00000000
--- a/docs/api/mlblocks.datasets.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-mlblocks.datasets
-=================
-
-.. automodule:: mlblocks.datasets
- :members:
diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst
deleted file mode 100644
index c9109130..00000000
--- a/docs/api/mlblocks.discovery.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-mlblocks.discovery
-==================
-
-.. automodule:: mlblocks.discovery
- :members:
diff --git a/docs/index.rst b/docs/index.rst
index 85717469..25567005 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,9 +6,9 @@ What is MLBlocks?
:alt: MLBlocks
:align: center
-* Free software: `MIT license <https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE>`_
* Documentation: https://mlbazaar.github.io/MLBlocks
-* Homepage: https://github.com/MLBazaar/MLBlocks
+* Github: https://github.com/MLBazaar/MLBlocks
+* License: `MIT <https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE>`_
MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
tools developed in Python, whether they are custom developments or belong to third party
diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst
index 8cde5340..082d12b6 100644
--- a/docs/pipeline_examples/graph.rst
+++ b/docs/pipeline_examples/graph.rst
@@ -30,7 +30,7 @@ additional information not found inside `X`.
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_umls
+ from mlprimitives.datasets import load_umls
dataset = load_umls()
dataset.describe()
diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst
index b9b97ef7..e892f915 100644
--- a/docs/pipeline_examples/image.rst
+++ b/docs/pipeline_examples/image.rst
@@ -24,7 +24,7 @@ Gradients using the corresponding `scikit-image function`_ to later on use a sim
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_usps
+ from mlprimitives.datasets import load_usps
dataset = load_usps()
dataset.describe()
@@ -61,7 +61,7 @@ and directly after go into a Single Layer CNN Classifier built on Keras using th
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_usps
+ from mlprimitives.datasets import load_usps
dataset = load_usps()
dataset.describe()
@@ -107,7 +107,7 @@ to an `XGBRegressor`_ primitive.
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_handgeometry
+ from mlprimitives.datasets import load_handgeometry
dataset = load_handgeometry()
dataset.describe()
diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst
index c2c2066f..7091a374 100644
--- a/docs/pipeline_examples/multi_table.rst
+++ b/docs/pipeline_examples/multi_table.rst
@@ -25,7 +25,7 @@ tables are.
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_wikiqa
+ from mlprimitives.datasets import load_wikiqa
dataset = load_wikiqa()
dataset.describe()
diff --git a/docs/pipeline_examples/single_table.rst b/docs/pipeline_examples/single_table.rst
index ee00d9c6..6a031cb1 100644
--- a/docs/pipeline_examples/single_table.rst
+++ b/docs/pipeline_examples/single_table.rst
@@ -5,7 +5,7 @@ In this section we will go over a few pipeline examples to show **MLBlocks** wor
in different scenarios and with different types of data.
For each example, we will be using example datasets which can be downloaded using the
-various functions found in the ``mlblocks.datasets`` module.
+various functions found in the ``mlprimitives.datasets`` module.
.. note:: Even though the datasets are not especially big, some of the examples might
use a considerable amount of resources, especially memory, and might take
@@ -21,7 +21,7 @@ the numeric data from `The Boston Dataset`_, which we will load using the
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_boston
+ from mlprimitives.datasets import load_boston
dataset = load_boston()
dataset.describe()
@@ -52,7 +52,7 @@ In this case, we will also be passing some initialization parameters for the XGB
.. code-block:: python
from mlblocks import MLPipeline
- from mlblocks.datasets import load_iris
+ from mlprimitives.datasets import load_iris
dataset = load_iris()
dataset.describe()
diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst
index ee0c16ac..75ca3f39 100644
--- a/docs/pipeline_examples/text.rst
+++ b/docs/pipeline_examples/text.rst
@@ -28,7 +28,7 @@ for later ones.
import nltk
from mlblocks import MLPipeline
- from mlblocks.datasets import load_newsgroups
+ from mlprimitives.datasets import load_newsgroups
dataset = load_newsgroups()
dataset.describe()
@@ -105,7 +105,7 @@ to encode all the string features, and go directly into the
import nltk
from mlblocks import MLPipeline
- from mlblocks.datasets import load_personae
+ from mlprimitives.datasets import load_personae
dataset = load_personae()
dataset.describe()
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index 829a38d6..44431d4f 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -616,7 +616,7 @@
}
],
"source": [
- "session.run(20)"
+ "session.run(10)"
]
},
{
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index e3d6fada..8e4e6537 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -11,8 +11,8 @@
"""
from mlblocks.discovery import (
- add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths,
- load_pipeline, load_primitive)
+ add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths,
+ get_primitives_paths, load_pipeline, load_primitive)
from mlblocks.mlblock import MLBlock
from mlblocks.mlpipeline import MLPipeline
@@ -23,6 +23,14 @@
__version__ = '0.4.0.dev1'
__all__ = [
- 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
- 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive'
+ 'MLBlock',
+ 'MLPipeline',
+ 'add_pipelines_path',
+ 'add_primitives_path',
+ 'find_pipelines',
+ 'find_primitives',
+ 'get_pipelines_paths',
+ 'get_primitives_paths',
+ 'load_pipeline',
+ 'load_primitive'
]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..3b01f6bf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+# Requirements for development and mybinder environment
+-e .[dev]
diff --git a/setup.cfg b/setup.cfg
index 17998d88..969e1d64 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,7 +33,6 @@ exclude = .tox, .git, __pycache__, .ipynb_checkpoints
ignore = # Keep empty to prevent default ignores
[isort]
-include_trailing_comment = True
line_length = 99
lines_between_types = 0
multi_line_output = 4
diff --git a/setup.py b/setup.py
index 0eab74aa..c9068f3a 100644
--- a/setup.py
+++ b/setup.py
@@ -5,12 +5,10 @@
from setuptools import find_packages, setup
-
-with open('README.md') as readme_file:
+with open('README.md', encoding='utf-8') as readme_file:
readme = readme_file.read()
-
-with open('HISTORY.md') as history_file:
+with open('HISTORY.md', encoding='utf-8') as history_file:
history = history_file.read()
@@ -21,13 +19,12 @@
]
-examples_require = [
- 'matplotlib>=2.2.2,<3.2.2',
+mlprimitives_requires = [
'mlprimitives>=0.3.0.dev0,<0.4',
- 'boto3>=1.14,<1.14.45',
- 'botocore<1.17.45,>=1.17.44',
+]
+
+examples_require = mlprimitives_requires + [
'jupyter==1.0.0',
- 'docutils<0.16,>=0.10',
'baytune>=0.3.13.dev0,<0.4',
]
@@ -79,9 +76,6 @@
# Documentation style
'doc8>=0.8.0',
'pydocstyle>=3.0.0',
-
- # Prevent travis-ci conflict
- 'chardet<4',
]
@@ -98,16 +92,17 @@
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
],
- description="Pipelines and primitives for machine learning and data science.",
+ description='Pipelines and primitives for machine learning and data science.',
extras_require={
'dev': development_requires + tests_require + examples_require,
'test': tests_require + examples_require,
'examples': examples_require,
+ 'mlprimitives': mlprimitives_requires,
},
include_package_data=True,
install_requires=install_requires,
keywords='auto machine learning classification regression data science pipeline',
- license="MIT license",
+ license='MIT license',
long_description=readme + '\n\n' + history,
long_description_content_type='text/markdown',
name='mlblocks',
diff --git a/tox.ini b/tox.ini
index 1bc3f81a..229c1d54 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,8 +14,10 @@ skip_install = false
extras = test
commands =
/usr/bin/env make test
+ rm -r {envdir}
[testenv:test-devel]
extras = dev
commands =
/usr/bin/env make test-devel
+ rm -r {envdir}
From 6dbcda49319047b1dcf339f5c00b830b61a8ed29 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sat, 9 Jan 2021 17:19:37 +0100
Subject: [PATCH 129/160] Fix dependency conflicts
---
requirements.txt | 1 +
setup.py | 6 ++++--
tox.ini | 1 +
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 3b01f6bf..d2ce3888 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
# Requirements for development and mybinder environment
-e .[dev]
+docutils<0.16,>=0.10 # Fix dependency conflict on mybinder
diff --git a/setup.py b/setup.py
index c9068f3a..91edced6 100644
--- a/setup.py
+++ b/setup.py
@@ -20,12 +20,14 @@
mlprimitives_requires = [
- 'mlprimitives>=0.3.0.dev0,<0.4',
+ 'mlprimitives>=0.3.0,<0.4',
+ 'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict
+ 'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3
]
examples_require = mlprimitives_requires + [
'jupyter==1.0.0',
- 'baytune>=0.3.13.dev0,<0.4',
+ 'baytune>=0.4.0,<0.5',
]
diff --git a/tox.ini b/tox.ini
index 229c1d54..e38f071b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,6 +9,7 @@ python =
[testenv]
passenv = CI TRAVIS TRAVIS_*
+allowlist_externals = rm
skipsdist = false
skip_install = false
extras = test
From 2c1e9a3f83bcb937a630b440dbbcef83db4eff4d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sat, 9 Jan 2021 17:20:33 +0100
Subject: [PATCH 130/160] Add release notes for v0.4.0
---
HISTORY.md | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index 17bbda92..da082c25 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,17 @@
Changelog
=========
+0.4.0 - 2021-01-09
+------------------
+
+* Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish
+* Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer
+* Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala
+* Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish
+* Allow loading a primitive or pipeline directly from the JSON path - [Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala
+* Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu
+* Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu
+
0.3.4 - 2019-11-01
------------------
From 04bb5fc72f55a9e2f439bed2d4ec3ae6537f52f4 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sat, 9 Jan 2021 17:22:42 +0100
Subject: [PATCH 131/160] =?UTF-8?q?Bump=20version:=200.4.0.dev1=20?=
=?UTF-8?q?=E2=86=92=200.4.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 8e4e6537..28a80c5d 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.0.dev1'
+__version__ = '0.4.0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 969e1d64..dc027074 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.0.dev1
+current_version = 0.4.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 91edced6..0c05c20b 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.0.dev1',
+ version='0.4.0',
zip_safe=False,
)
From ae9653bfd0ae3e9798071d8bec311cee4e396804 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sat, 9 Jan 2021 17:22:55 +0100
Subject: [PATCH 132/160] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200?=
=?UTF-8?q?.4.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 28a80c5d..61438750 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.0'
+__version__ = '0.4.1.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index dc027074..96b72ce1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.0
+current_version = 0.4.1.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 0c05c20b..db8f5aa6 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.0',
+ version='0.4.1.dev0',
zip_safe=False,
)
From 098302e83d17d05425bf546077805738abeaebc7 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
<41479552+pvk-developer@users.noreply.github.com>
Date: Mon, 8 Mar 2021 16:27:05 +0100
Subject: [PATCH 133/160] Implement dynamic inputs and outputs. (#135)
* Implement dynamic inputs and outputs.
* Resolve block_outputs from the block's instance when it is given as a string.
* Update tests
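
Dynamic arguments let a primitive annotation point at an instance method instead of a static list: when `fit_args`, `produce_args` or the block outputs are given as a string, MLBlocks now resolves them with `getattr(block.instance, name)()` at runtime. A minimal sketch of such a primitive, assuming a hypothetical `DynamicPrimitive` class and method names (only the string-resolution mechanism comes from this patch):

```python
# Hypothetical primitive whose argument lists are computed at runtime.
# In its JSON annotation, the "args"/"output" entries would be the
# method names below (strings) instead of static lists.
class DynamicPrimitive:

    def get_produce_args(self):
        # Same format a static annotation would declare.
        return [{'name': 'X', 'type': 'pandas.DataFrame'}]

    def get_produce_outputs(self):
        return [{'name': 'X', 'type': 'pandas.DataFrame'}]

    def produce(self, X):
        return X
```

The new `MLPipeline.get_fit_args()` and `MLPipeline.get_predict_args()` helpers then return the resolved pipeline inputs as plain lists, as exercised by the updated tests.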
---
mlblocks/mlblock.py | 13 +++-
mlblocks/mlpipeline.py | 16 +++++
tests/test_mlpipeline.py | 142 ++++++++++++++++++++++++++++++---------
3 files changed, 136 insertions(+), 35 deletions(-)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index f570165b..d2295722 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -111,8 +111,15 @@ def _extract_params(self, kwargs, hyperparameters):
if name in kwargs:
init_params[name] = kwargs.pop(name)
- fit_args = [arg['name'] for arg in self.fit_args]
- produce_args = [arg['name'] for arg in self.produce_args]
+ if not isinstance(self.fit_args, str):
+ fit_args = [arg['name'] for arg in self.fit_args]
+ else:
+ fit_args = []
+
+ if not isinstance(self.produce_args, str):
+ produce_args = [arg['name'] for arg in self.produce_args]
+ else:
+ produce_args = []
for name in list(kwargs.keys()):
if name in fit_args:
@@ -257,6 +264,8 @@ def _get_method_kwargs(self, kwargs, method_args):
A dictionary containing the argument names and values to pass
to the primitive method.
"""
+ if isinstance(method_args, str):
+ method_args = getattr(self.instance, method_args)()
method_kwargs = dict()
for arg in method_args:
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d7935757..738b13b0 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -177,6 +177,9 @@ def _get_block_variables(self, block_name, variables_attr, names):
"""
block = self.blocks[block_name]
variables = deepcopy(getattr(block, variables_attr))
+ if isinstance(variables, str):
+ variables = getattr(block.instance, variables)()
+
variable_dict = {}
for variable in variables:
name = variable['name']
@@ -300,6 +303,12 @@ def get_inputs(self, fit=True):
return inputs
+ def get_fit_args(self):
+ return list(self.get_inputs(fit=True).values())
+
+ def get_predict_args(self):
+ return list(self.get_inputs(fit=False).values())
+
def get_outputs(self, outputs='default'):
"""Get the list of output variables that correspond to the specified outputs.
@@ -578,6 +587,10 @@ def _get_block_args(self, block_name, block_args, context):
input_names = self.input_names.get(block_name, dict())
+ if isinstance(block_args, str):
+ block = self.blocks[block_name]
+ block_args = getattr(block.instance, block_args)()
+
kwargs = dict()
for arg in block_args:
name = arg['name']
@@ -591,6 +604,9 @@ def _get_block_args(self, block_name, block_args, context):
def _extract_outputs(self, block_name, outputs, block_outputs):
"""Extract the outputs of the method as a dict to be set into the context."""
# TODO: type validation and/or transformation should be done here
+ if isinstance(block_outputs, str):
+ block = self.blocks[block_name]
+ block_outputs = getattr(block.instance, block_outputs)()
if not isinstance(outputs, tuple):
outputs = (outputs, )
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index 0ee4cf2c..be8c6f6b 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -381,6 +381,7 @@ def test_get_outputs_str_named(self):
]
}
pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
+
returned = pipeline.get_outputs('debug')
expected = [
@@ -389,13 +390,11 @@ def test_get_outputs_str_named(self):
'variable': 'another_variable',
}
]
-
assert returned == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_outputs_str_variable(self):
pipeline = MLPipeline(['a_primitive', 'another_primitive'])
-
pipeline.blocks['a_primitive#1'].produce_output = [
{
'name': 'output',
@@ -412,7 +411,6 @@ def test_get_outputs_str_variable(self):
'variable': 'a_primitive#1.output'
}
]
-
assert returned == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
@@ -427,7 +425,6 @@ def test_get_outputs_str_block(self):
'variable': 'a_primitive#1',
}
]
-
assert returned == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
@@ -442,7 +439,6 @@ def test_get_outputs_int(self):
'variable': 'another_primitive#1',
}
]
-
assert returned == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
@@ -463,7 +459,6 @@ def test_get_outputs_combination(self):
]
}
pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs)
-
pipeline.blocks['a_primitive#1'].produce_output = [
{
'name': 'output',
@@ -498,7 +493,6 @@ def test_get_outputs_combination(self):
'variable': 'a_primitive#1.output'
}
]
-
assert returned == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
@@ -550,21 +544,39 @@ def test_get_output_variables(self):
assert names == ['a_variable']
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
- def test__get_block_variables(self):
+ def test__get_block_variables_is_dict(self):
+ pipeline = MLPipeline(['a_primitive'])
+ pipeline.blocks['a_primitive#1'].produce_outputs = [
+ {
+ 'name': 'output',
+ 'type': 'whatever'
+ }
+ ]
+
+ outputs = pipeline._get_block_variables(
+ 'a_primitive#1',
+ 'produce_outputs',
+ {'output': 'name_output'}
+ )
+
expected = {
'name_output': {
'name': 'output',
'type': 'whatever',
}
}
+ assert outputs == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test__get_block_variables_is_str(self):
pipeline = MLPipeline(['a_primitive'])
-
- pipeline.blocks['a_primitive#1'].produce_outputs = [
+ pipeline.blocks['a_primitive#1'].produce_outputs = 'get_produce_outputs'
+ pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.return_value = [
{
- 'name': 'output',
- 'type': 'whatever'
+ 'name': 'output_from_function',
+ 'type': 'test'
}
+
]
outputs = pipeline._get_block_variables(
@@ -572,10 +584,50 @@ def test__get_block_variables(self):
'produce_outputs',
{'output': 'name_output'}
)
+
+ expected = {
+ 'output_from_function': {
+ 'name': 'output_from_function',
+ 'type': 'test',
+ }
+ }
assert outputs == expected
+ pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.assert_called_once_with()
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_inputs_fit(self):
+ pipeline = MLPipeline(['a_primitive', 'another_primitive'])
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ }
+ ]
+ pipeline.blocks['another_primitive#1'].produce_args = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ },
+ {
+ 'name': 'another_input',
+ 'type': 'another_whatever'
+ }
+ ]
+
+ inputs = pipeline.get_inputs()
+
expected = {
'input': {
'name': 'input',
@@ -589,32 +641,30 @@ def test_get_inputs_fit(self):
'name': 'another_input',
'type': 'another_whatever',
}
-
}
+ assert inputs == expected
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_inputs_no_fit(self):
pipeline = MLPipeline(['a_primitive', 'another_primitive'])
-
pipeline.blocks['a_primitive#1'].produce_args = [
{
'name': 'input',
'type': 'whatever'
}
]
-
pipeline.blocks['a_primitive#1'].fit_args = [
{
'name': 'fit_input',
'type': 'whatever'
}
]
-
pipeline.blocks['a_primitive#1'].produce_output = [
{
'name': 'output',
'type': 'another_whatever'
}
]
-
pipeline.blocks['another_primitive#1'].produce_args = [
{
'name': 'output',
@@ -626,11 +676,8 @@ def test_get_inputs_fit(self):
}
]
- inputs = pipeline.get_inputs()
- assert inputs == expected
+ inputs = pipeline.get_inputs(fit=False)
- @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
- def test_get_inputs_no_fit(self):
expected = {
'input': {
'name': 'input',
@@ -640,25 +687,24 @@ def test_get_inputs_no_fit(self):
'name': 'another_input',
'type': 'another_whatever',
}
-
}
+ assert inputs == expected
- pipeline = MLPipeline(['a_primitive', 'another_primitive'])
-
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_fit_args(self):
+ pipeline = MLPipeline(['a_primitive'])
pipeline.blocks['a_primitive#1'].produce_args = [
{
'name': 'input',
'type': 'whatever'
}
]
-
pipeline.blocks['a_primitive#1'].fit_args = [
{
'name': 'fit_input',
'type': 'whatever'
}
]
-
pipeline.blocks['a_primitive#1'].produce_output = [
{
'name': 'output',
@@ -666,20 +712,50 @@ def test_get_inputs_no_fit(self):
}
]
- pipeline.blocks['another_primitive#1'].produce_args = [
+ outputs = pipeline.get_fit_args()
+
+ expected = [
{
- 'name': 'output',
- 'type': 'another_whatever'
+ 'name': 'input',
+ 'type': 'whatever'
},
{
- 'name': 'another_input',
- 'type': 'another_whatever'
+ 'name': 'fit_input',
+ 'type': 'whatever',
}
]
+ assert outputs == expected
- inputs = pipeline.get_inputs(fit=False)
+ @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
+ def test_get_predict_args(self):
+ pipeline = MLPipeline(['a_primitive'])
+ pipeline.blocks['a_primitive#1'].produce_args = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].fit_args = [
+ {
+ 'name': 'fit_input',
+ 'type': 'whatever'
+ }
+ ]
+ pipeline.blocks['a_primitive#1'].produce_output = [
+ {
+ 'name': 'output',
+ 'type': 'another_whatever'
+ }
+ ]
+ outputs = pipeline.get_predict_args()
- assert inputs == expected
+ expected = [
+ {
+ 'name': 'input',
+ 'type': 'whatever'
+ }
+ ]
+ assert outputs == expected
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_fit_pending_all_primitives(self):
From 286e0f207d569eff4d2b1a52aeb128965a5372a7 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Mon, 8 Mar 2021 18:08:19 +0100
Subject: [PATCH 134/160] =?UTF-8?q?Bump=20version:=200.4.1.dev0=20?=
=?UTF-8?q?=E2=86=92=200.4.1.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 61438750..5e8d665e 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.1.dev0'
+__version__ = '0.4.1.dev1'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 96b72ce1..e75ffe48 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.1.dev0
+current_version = 0.4.1.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index db8f5aa6..a48b031f 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.1.dev0',
+ version='0.4.1.dev1',
zip_safe=False,
)
From ae1cdd66a10bb0e6341ab716e1fdb7ca7fc51bae Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com>
Date: Mon, 27 Sep 2021 10:38:04 -0400
Subject: [PATCH 135/160] Update dependencies (#136)
* Increase numpy cap
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index a48b031f..78f4053a 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
install_requires = [
'graphviz>=0.9,<1',
- 'numpy>=1.17.1,<1.19',
+ 'numpy>=1.17.1,<1.21',
'psutil>=5,<6',
]
From 3585628764bcb0bb2e06348eed4a90da5df3d4df Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Fri, 8 Oct 2021 10:55:56 -0400
Subject: [PATCH 136/160] =?UTF-8?q?Bump=20version:=200.4.1.dev1=20?=
=?UTF-8?q?=E2=86=92=200.4.1.dev2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 5e8d665e..f3ead991 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.1.dev1'
+__version__ = '0.4.1.dev2'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index e75ffe48..b106c1e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.1.dev1
+current_version = 0.4.1.dev2
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 78f4053a..6a193b32 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.1.dev1',
+ version='0.4.1.dev2',
zip_safe=False,
)
From e8d353da3bf2585d4cbed40f07dda93529690196 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Fri, 8 Oct 2021 12:06:57 -0400
Subject: [PATCH 137/160] prepare release notes
---
HISTORY.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index da082c25..0575c034 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,12 @@
Changelog
=========
+0.4.1 - 2021-10-08
+------------------
+
+* Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish
+* Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer
+
0.4.0 - 2021-01-09
------------------
From 16ba53c557a770760bb46fbf17566891a258cdb3 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Fri, 8 Oct 2021 12:07:26 -0400
Subject: [PATCH 138/160] =?UTF-8?q?Bump=20version:=200.4.1.dev2=20?=
=?UTF-8?q?=E2=86=92=200.4.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index f3ead991..9c9d5d13 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.1.dev2'
+__version__ = '0.4.1'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index b106c1e6..84f59fab 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.1.dev2
+current_version = 0.4.1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 6a193b32..b7c717be 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.1.dev2',
+ version='0.4.1',
zip_safe=False,
)
From 515d0a7af4e6466014333eace818d3a64a2ce46b Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Fri, 8 Oct 2021 12:07:41 -0400
Subject: [PATCH 139/160] =?UTF-8?q?Bump=20version:=200.4.1=20=E2=86=92=200?=
=?UTF-8?q?.4.2.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 9c9d5d13..9c42ed1a 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.1'
+__version__ = '0.4.2.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 84f59fab..fc9e4e12 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.1
+current_version = 0.4.2.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index b7c717be..c0432aa4 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.1',
+ version='0.4.2.dev0',
zip_safe=False,
)
From 79fc8fbc4632f164102c4973badd13cd38c31e84 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com>
Date: Sun, 11 Dec 2022 16:12:08 -0600
Subject: [PATCH 140/160] Update `numpy` dependency (#139)
* push numpy cap
* add separate tests for mlblocks
* fix command
* create new unit test environment
* pin jinja2
* pin markupsafe
* add docutils
* pin scikit-learn for docs
* unpin scikit-learn and add okwarning
---
.github/workflows/tests.yml | 35 +++++++++++++++++++++--------
Makefile | 6 ++++-
docs/getting_started/quickstart.rst | 2 ++
setup.py | 8 +++++--
tests/test_mlpipeline.py | 6 ++---
5 files changed, 42 insertions(+), 15 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ea2c37f5..4cb525ed 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install package
@@ -29,11 +29,11 @@ jobs:
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest, macos-latest]
+ os: [ubuntu-20.04, macos-latest]
steps:
- uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install package and dependencies
@@ -46,31 +46,48 @@ jobs:
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest, macos-latest]
+ os: [ubuntu-20.04, macos-latest]
steps:
- uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install package and dependencies
- run: pip install .[test]
+ run: pip install .[unit]
- name: make test-unit
run: make test-unit
+ unit-mlprimitives:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8]
+ os: [ubuntu-20.04, macos-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install package and dependencies
+ run: pip install .[test]
+ - name: make test-mlprimitives
+ run: make test-mlprimitives
+
tutorials:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest]
+ os: [ubuntu-20.04]
steps:
- uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
+ uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- - if: matrix.os == 'ubuntu-latest'
+ - if: matrix.os == 'ubuntu-20.04'
name: Install dependencies - Ubuntu
run: sudo apt-get install graphviz
- name: Install package and dependencies
diff --git a/Makefile b/Makefile
index c28da455..2ae6c7c3 100644
--- a/Makefile
+++ b/Makefile
@@ -118,6 +118,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle
.PHONY: test-unit
test-unit: ## run tests quickly with the default Python
+ python -m pytest --cov=mlblocks --ignore=tests/features/
+
+.PHONY: test-mlprimitives
+test-mlprimitives: ## run tests quickly with the default Python
python -m pytest --cov=mlblocks
.PHONY: test-readme
@@ -132,7 +136,7 @@ test-tutorials: ## run the tutorial notebooks
jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null +
.PHONY: test
-test: test-unit test-readme ## test everything that needs test dependencies
+test: test-unit test-mlprimitives test-readme ## test everything that needs test dependencies
.PHONY: check-dependencies
check-dependencies: ## test if there are any broken dependencies
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index 386752dc..f0cb9a3f 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -102,6 +102,7 @@ To do this, we first call the ``fit`` method passing the training data and the c
labels.
.. ipython:: python
+ :okwarning:
from mlprimitives.datasets import load_census
dataset = load_census()
@@ -112,6 +113,7 @@ Once we have fitted our model to our data, we can call the ``predict`` method pa
to obtain predictions from the pipeline.
.. ipython:: python
+ :okwarning:
predictions = pipeline.predict(X_test)
predictions
diff --git a/setup.py b/setup.py
index c0432aa4..85b05bcd 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
install_requires = [
'graphviz>=0.9,<1',
- 'numpy>=1.17.1,<1.21',
+ 'numpy>=1.17.1,<2',
'psutil>=5,<6',
]
@@ -23,6 +23,7 @@
'mlprimitives>=0.3.0,<0.4',
'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict
'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3
+ 'protobuf<4', # <- importlib
]
examples_require = mlprimitives_requires + [
@@ -34,7 +35,6 @@
tests_require = [
'pytest>=3.4.2',
'pytest-cov>=2.6.0',
- 'mlprimitives>=0.3.0.dev0,<0.4',
'setuptools>=41.0.0',
'rundoc>=0.4.3',
'prompt-toolkit>=2.0,<3.0',
@@ -56,8 +56,11 @@
'm2r>=0.2.0,<0.3',
'Sphinx>=1.7.1,<3',
'sphinx_rtd_theme>=0.2.4,<0.5',
+ 'docutils>=0.12,<0.18',
'ipython>=6.5.0',
'autodocsumm>=0.1.10',
+ 'Jinja2>=2,<3', # >=3 makes sphinx theme fail
+ 'markupsafe<2.1.0',
# style check
'flake8>=3.7.7,<4',
@@ -97,6 +100,7 @@
description='Pipelines and primitives for machine learning and data science.',
extras_require={
'dev': development_requires + tests_require + examples_require,
+ 'unit': tests_require,
'test': tests_require + examples_require,
'examples': examples_require,
'mlprimitives': mlprimitives_requires,
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py
index be8c6f6b..084eac3d 100644
--- a/tests/test_mlpipeline.py
+++ b/tests/test_mlpipeline.py
@@ -1124,7 +1124,7 @@ def test_get_diagram_simple(self):
]
pipeline.blocks['a_primitive#1'].produce_output = output
- assert str(pipeline.get_diagram()) == expected
+ assert str(pipeline.get_diagram()).strip() == expected.strip()
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_diagram_fit(self):
@@ -1155,7 +1155,7 @@ def test_get_diagram_fit(self):
]
pipeline.blocks['a_primitive#1'].produce_output = output
- assert str(pipeline.get_diagram()) == expected
+ assert str(pipeline.get_diagram()).strip() == expected.strip()
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock)
def test_get_diagram_multiple_blocks(self):
@@ -1189,7 +1189,7 @@ def test_get_diagram_multiple_blocks(self):
pipeline.blocks['b_primitive#1'].produce_args = first_output
pipeline.blocks['b_primitive#1'].produce_output = second_output
- assert str(pipeline.get_diagram()) == expected
+ assert str(pipeline.get_diagram()).strip() == expected.strip()
def test_fit(self):
pass
From 40c5c413dc62cd1e38b6fa8e40fc858b6ac54479 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Sun, 22 Jan 2023 16:25:44 -0500
Subject: [PATCH 141/160] add release notes
---
HISTORY.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index 0575c034..c183b575 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
Changelog
=========
+0.5.0 - 2023-01-22
+------------------
+
+* Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish
+
0.4.1 - 2021-10-08
------------------
From a4ba9c4e588d88b95797117e2562100bb76e6def Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Sun, 22 Jan 2023 16:26:03 -0500
Subject: [PATCH 142/160] =?UTF-8?q?Bump=20version:=200.4.2.dev0=20?=
=?UTF-8?q?=E2=86=92=200.5.0.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 9c42ed1a..82a61ca3 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.2.dev0'
+__version__ = '0.5.0.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index fc9e4e12..d21c7a1a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.2.dev0
+current_version = 0.5.0.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 85b05bcd..4926b10a 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.4.2.dev0',
+ version='0.5.0.dev0',
zip_safe=False,
)
From 8140e3dcfe017e2a1e04ada9c6783f2dcdf30198 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Sun, 22 Jan 2023 16:26:03 -0500
Subject: [PATCH 143/160] =?UTF-8?q?Bump=20version:=200.5.0.dev0=20?=
=?UTF-8?q?=E2=86=92=200.5.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 82a61ca3..7cc2da30 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.5.0.dev0'
+__version__ = '0.5.0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index d21c7a1a..746b4d2f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.5.0.dev0
+current_version = 0.5.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 4926b10a..8b11e6ff 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.5.0.dev0',
+ version='0.5.0',
zip_safe=False,
)
From a70b30713416ca1bc1a4cf2c2675cda383e28ca8 Mon Sep 17 00:00:00 2001
From: sarahmish
Date: Sun, 22 Jan 2023 16:26:18 -0500
Subject: [PATCH 144/160] =?UTF-8?q?Bump=20version:=200.5.0=20=E2=86=92=200?=
=?UTF-8?q?.5.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 7cc2da30..3e7aa671 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.5.0'
+__version__ = '0.5.1.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 746b4d2f..70204a8c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.5.0
+current_version = 0.5.1.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 8b11e6ff..70d599ea 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.5.0',
+ version='0.5.1.dev0',
zip_safe=False,
)
From c74137e6a52c141d2bc10bb8b11de5b72e83ea07 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:17:52 -0400
Subject: [PATCH 145/160] Upgrade python (#142)
* add python 3.9 and 3.10
* fix python specification
* update python version in readme
---
.github/workflows/tests.yml | 2 +-
Makefile | 4 ++++
README.md | 2 +-
setup.py | 4 +++-
tox.ini | 4 +++-
5 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4cb525ed..3f46f728 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -45,7 +45,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
os: [ubuntu-20.04, macos-latest]
steps:
- uses: actions/checkout@v1
diff --git a/Makefile b/Makefile
index 2ae6c7c3..4fa8cc04 100644
--- a/Makefile
+++ b/Makefile
@@ -76,6 +76,10 @@ install: clean-build clean-pyc ## install the package to the active Python's sit
install-examples: clean-build clean-pyc ## install the package and the examples dependencies
pip install .[examples]
+.PHONY: install-unit
+install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests
+ pip install .[unit]
+
.PHONY: install-test
install-test: clean-build clean-pyc ## install the package and test dependencies
pip install .[test]
diff --git a/README.md b/README.md
index 4da013b0..13c23c3a 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ Features include:
## Requirements
-**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/)
+**MLBlocks** has been developed and tested on [Python 3.6, 3.7, 3.8, 3.9, and 3.10](https://www.python.org/downloads/)
## Install with `pip`
diff --git a/setup.py b/setup.py
index 70d599ea..17159dbb 100644
--- a/setup.py
+++ b/setup.py
@@ -96,6 +96,8 @@
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
],
description='Pipelines and primitives for machine learning and data science.',
extras_require={
@@ -113,7 +115,7 @@
long_description_content_type='text/markdown',
name='mlblocks',
packages=find_packages(include=['mlblocks', 'mlblocks.*']),
- python_requires='>=3.6,<3.9',
+ python_requires='>=3.6,<3.11',
setup_requires=setup_requires,
test_suite='tests',
tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index e38f071b..a589526a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,10 @@
[tox]
-envlist = py3{6,7,8}, test-devel
+envlist = py3{6,7,8,9,10}, test-devel
[travis]
python =
3.10: py310
+ 3.9: py39
3.8: py38, test-devel
3.7: py37
3.6: py36
From b85983d956699c5863e153816543fb6f29bdb8ff Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Fri, 14 Apr 2023 14:28:08 -0400
Subject: [PATCH 146/160] =?UTF-8?q?Bump=20version:=200.5.1.dev0=20?=
=?UTF-8?q?=E2=86=92=200.5.1.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3e7aa671..3b880bb8 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.5.1.dev0'
+__version__ = '0.5.1.dev1'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 70204a8c..40f0d06a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.5.1.dev0
+current_version = 0.5.1.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 17159dbb..9ab20327 100644
--- a/setup.py
+++ b/setup.py
@@ -120,6 +120,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.5.1.dev0',
+ version='0.5.1.dev1',
zip_safe=False,
)
From 6597bfa501bc341e27f48e2ca357a9b61a17a854 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Fri, 14 Apr 2023 15:21:18 -0400
Subject: [PATCH 147/160] add release notes
---
HISTORY.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/HISTORY.md b/HISTORY.md
index c183b575..f1c4209f 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
Changelog
=========
+0.6.0 - 2023-04-14
+------------------
+
+* Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish
+
0.5.0 - 2023-01-22
------------------
From 1cc2551142cc21165a09f52063545b3edd02fed7 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Fri, 14 Apr 2023 15:21:31 -0400
Subject: [PATCH 148/160] =?UTF-8?q?Bump=20version:=200.5.1.dev1=20?=
=?UTF-8?q?=E2=86=92=200.6.0.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3b880bb8..344fd4b2 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.5.1.dev1'
+__version__ = '0.6.0.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 40f0d06a..4637a833 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.5.1.dev1
+current_version = 0.6.0.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 9ab20327..80137119 100644
--- a/setup.py
+++ b/setup.py
@@ -120,6 +120,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.5.1.dev1',
+ version='0.6.0.dev0',
zip_safe=False,
)
From f934db0d36f4d4965707092209fcafdba74dc330 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Fri, 14 Apr 2023 15:21:31 -0400
Subject: [PATCH 149/160] =?UTF-8?q?Bump=20version:=200.6.0.dev0=20?=
=?UTF-8?q?=E2=86=92=200.6.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 344fd4b2..650b26ca 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.6.0.dev0'
+__version__ = '0.6.0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 4637a833..2800a7f1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.6.0.dev0
+current_version = 0.6.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 80137119..fd8791a8 100644
--- a/setup.py
+++ b/setup.py
@@ -120,6 +120,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.6.0.dev0',
+ version='0.6.0',
zip_safe=False,
)
From ec8433590f8e928484f49ea0a76543caf7f117b5 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Fri, 14 Apr 2023 15:21:51 -0400
Subject: [PATCH 150/160] =?UTF-8?q?Bump=20version:=200.6.0=20=E2=86=92=200?=
=?UTF-8?q?.6.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 650b26ca..021d9734 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.6.0'
+__version__ = '0.6.1.dev0'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 2800a7f1..40e7b099 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.6.0
+current_version = 0.6.1.dev0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index fd8791a8..c9658a63 100644
--- a/setup.py
+++ b/setup.py
@@ -120,6 +120,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/MLBazaar/MLBlocks',
- version='0.6.0',
+ version='0.6.1.dev0',
zip_safe=False,
)
From 21f0df503609fe256ca9711b98fd92f4b83a522e Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com>
Date: Tue, 26 Sep 2023 10:43:52 -0400
Subject: [PATCH 151/160] Add python 3.11 to MLBlocks (#143)
* test python 3.11
* pin lightfm
* update pip
* fix syntax
* add wheel
* fix data loading
* fix readme example
* remove data
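
The new `examples/tutorials/utils.py` does not appear in this excerpt; a plausible sketch, assuming it wraps the same census CSV used in the README and quickstart changes and preserves the `get_splits`/`score` interface the notebooks keep calling (class and attribute names are hypothetical):

```python
# Hypothetical sketch of examples/tutorials/utils.py.
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class Dataset:
    """Thin stand-in for the mlprimitives census Dataset."""

    def __init__(self, data, target):
        self.data = data
        self.target = target

    def get_splits(self, n_splits=1):
        # Single stratified train/test split, as the tutorials use it.
        return train_test_split(self.data, self.target, stratify=self.target)

    def score(self, y_true, y_pred):
        return accuracy_score(y_true, y_pred)


def load_census():
    data = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv')
    target = data.pop('label')
    return Dataset(data, target)
```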
---
.github/workflows/tests.yml | 18 ++++++-
README.md | 12 +++--
docs/getting_started/quickstart.rst | 14 +++--
.../tutorials/1. Using and MLPipeline.ipynb | 23 +++++---
.... Setting MLPipeline Hyperparameters.ipynb | 24 +++++++--
.../4. Saving and Loading a Pipeline.ipynb | 19 +++++--
...ial execution and pipeline debugging.ipynb | 19 +++++--
.../6. Flexible outputs specification.ipynb | 30 ++++++++---
examples/tutorials/7. Tuning a Pipeline.ipynb | 4 +-
...or the best pipeline with BTBSession.ipynb | 20 +++----
examples/tutorials/utils.py | 52 +++++++++++++++++++
setup.py | 3 +-
tox.ini | 3 +-
13 files changed, 187 insertions(+), 54 deletions(-)
create mode 100644 examples/tutorials/utils.py
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3f46f728..0eb00220 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -19,6 +19,10 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
+ - name: Upgrade pip
+ run: pip install -U pip setuptools wheel
+ - name: Install lightfm
+ run: python -m pip install --no-use-pep517 'lightfm<2'
- name: Install package
run: pip install .[dev]
- name: make test-devel
@@ -36,6 +40,10 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
+ - name: Upgrade pip
+ run: pip install -U pip setuptools wheel
+ - name: Install lightfm
+ run: python -m pip install --no-use-pep517 'lightfm<2'
- name: Install package and dependencies
run: pip install rundoc .[mlprimitives]
- name: make test-readme
@@ -45,7 +53,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
+ python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11']
os: [ubuntu-20.04, macos-latest]
steps:
- uses: actions/checkout@v1
@@ -70,6 +78,10 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
+ - name: Upgrade pip
+ run: pip install -U pip setuptools wheel
+ - name: Install lightfm
+ run: python -m pip install --no-use-pep517 'lightfm<2'
- name: Install package and dependencies
run: pip install .[test]
- name: make test-mlprimitives
@@ -90,6 +102,10 @@ jobs:
- if: matrix.os == 'ubuntu-20.04'
name: Install dependencies - Ubuntu
run: sudo apt-get install graphviz
+ - name: Upgrade pip
+ run: pip install -U pip setuptools wheel
+ - name: Install lightfm
+ run: python -m pip install --no-use-pep517 'lightfm<2'
- name: Install package and dependencies
run: pip install .[examples]
- name: make test-tutorials
diff --git a/README.md b/README.md
index 13c23c3a..662a3ed3 100644
--- a/README.md
+++ b/README.md
@@ -86,11 +86,15 @@ pipeline which combines primitives from [MLPrimitives](https://github.com/MLBaza
[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/).
```python3
+import pandas as pd
from mlblocks import MLPipeline
-from mlprimitives.datasets import load_dataset
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
-dataset = load_dataset('census')
-X_train, X_test, y_train, y_test = dataset.get_splits(1)
+dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv')
+label = dataset.pop('label')
+
+X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label)
primitives = [
'mlprimitives.custom.preprocessing.ClassEncoder',
@@ -104,7 +108,7 @@ pipeline = MLPipeline(primitives)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
-dataset.score(y_test, predictions)
+accuracy_score(y_test, predictions)
```
# What's Next?
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index f0cb9a3f..55c20d86 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -104,9 +104,13 @@ labels.
.. ipython:: python
:okwarning:
- from mlprimitives.datasets import load_census
- dataset = load_census()
- X_train, X_test, y_train, y_test = dataset.get_splits(1)
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+    dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv')
+ label = dataset.pop('label')
+
+ X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label)
pipeline.fit(X_train, y_train)
Once we have fitted our model to our data, we can call the ``predict`` method passing new data
@@ -115,9 +119,11 @@ to obtain predictions from the pipeline.
.. ipython:: python
:okwarning:
+ from sklearn.metrics import accuracy_score
+
predictions = pipeline.predict(X_test)
predictions
- dataset.score(y_test, predictions)
+ accuracy_score(y_test, predictions)
.. _you have already installed them: install.html#additional-dependencies
.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline
diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb
index dab130ea..901cc50b 100644
--- a/examples/tutorials/1. Using and MLPipeline.ipynb
+++ b/examples/tutorials/1. Using and MLPipeline.ipynb
@@ -33,9 +33,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
@@ -528,7 +528,16 @@
"cell_type": "code",
"execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"pipeline.fit(X_train, y_train)"
]
@@ -546,9 +555,7 @@
{
"cell_type": "code",
"execution_count": 14,
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [],
"source": [
"predictions = pipeline.predict(X_test)"
@@ -611,7 +618,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -625,7 +632,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 4993fd4e..7aa0ab2b 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -37,9 +37,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')\n",
+ "dataset = load_census()\n",
"X_train, X_test, y_train, y_test = dataset.get_splits(1)"
]
},
@@ -268,6 +268,14 @@
"execution_count": 7,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ },
{
"data": {
"text/plain": [
@@ -394,6 +402,14 @@
"execution_count": 11,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ },
{
"data": {
"text/plain": [
@@ -415,7 +431,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -429,7 +445,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
index 01a58cd5..ec1c6f97 100644
--- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
+++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
@@ -35,9 +35,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
@@ -71,7 +71,16 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"pipeline.fit(X_train, y_train)"
]
@@ -166,7 +175,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -180,7 +189,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
index 57b2b43c..769a69c1 100644
--- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
+++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
@@ -36,9 +36,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
@@ -430,7 +430,16 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"fit_context = pipeline.fit(start_=1, output_=2, **fit_context)"
]
@@ -690,7 +699,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -704,7 +713,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb
index ca1048dd..6ecad5a5 100644
--- a/examples/tutorials/6. Flexible outputs specification.ipynb
+++ b/examples/tutorials/6. Flexible outputs specification.ipynb
@@ -37,9 +37,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
@@ -420,7 +420,16 @@
"cell_type": "code",
"execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"output_spec = [\n",
" 'sklearn.impute.SimpleImputer#1.X',\n",
@@ -441,7 +450,16 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"output_spec = [\n",
" 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n",
@@ -495,7 +513,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -509,7 +527,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
index ca30df17..7a288a46 100644
--- a/examples/tutorials/7. Tuning a Pipeline.ipynb
+++ b/examples/tutorials/7. Tuning a Pipeline.ipynb
@@ -34,9 +34,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index 44431d4f..80ad93fb 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -37,9 +37,9 @@
"metadata": {},
"outputs": [],
"source": [
- "from mlprimitives.datasets import load_dataset\n",
+ "from utils import load_census\n",
"\n",
- "dataset = load_dataset('census')"
+ "dataset = load_census()"
]
},
{
@@ -309,9 +309,7 @@
{
"cell_type": "code",
"execution_count": 11,
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"data": {
@@ -536,9 +534,7 @@
{
"cell_type": "code",
"execution_count": 13,
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"data": {
@@ -707,9 +703,7 @@
{
"cell_type": "code",
"execution_count": 16,
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"data": {
@@ -772,7 +766,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -786,7 +780,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.16"
}
},
"nbformat": 4,
diff --git a/examples/tutorials/utils.py b/examples/tutorials/utils.py
new file mode 100644
index 00000000..32b210a7
--- /dev/null
+++ b/examples/tutorials/utils.py
@@ -0,0 +1,52 @@
+import io
+import os
+
+import pandas as pd
+from mlprimitives.datasets import Dataset
+from sklearn.metrics import accuracy_score
+
+DATA_PATH = os.path.join(
+ os.path.dirname(__file__),
+ 'data'
+)
+
+DATA_URL = 'http://mlblocks.s3.amazonaws.com/{}.csv'
+
+def _download(dataset_name, dataset_path):
+ url = DATA_URL.format(dataset_name)
+
+ data = pd.read_csv(url)
+ data.to_csv(dataset_path, index=False)
+
+def _load(dataset_name):
+ if not os.path.exists(DATA_PATH):
+ os.makedirs(DATA_PATH)
+
+ dataset_path = os.path.join(DATA_PATH, dataset_name + '.csv')
+ if not os.path.exists(dataset_path):
+ _download(dataset_name, dataset_path)
+
+ return dataset_path
+
+def load_census():
+ """Adult Census dataset.
+
+ Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset.
+
+ Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean
+ records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&
+ (AFNLWGT>1) && (HRSWK>0))
+
+ Prediction task is to determine whether a person makes over 50K a year.
+
+ source: "UCI
+ sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income"
+ """
+
+ dataset_path = _load('census_train')
+
+ X = pd.read_csv(dataset_path)
+ y = X.pop('label').values
+
+ return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table',
+ 'classification', 'binary', stratify=True)
\ No newline at end of file
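The new utils.py gives the tutorials a self-contained loader: the CSV is fetched once from the mlblocks S3 bucket, cached under examples/tutorials/data, and wrapped in an mlprimitives Dataset. A minimal sketch of how the notebooks above consume it:

    from utils import load_census

    # The first call downloads census_train.csv next to the notebooks;
    # later calls reuse the cached copy.
    dataset = load_census()
    X_train, X_test, y_train, y_test = dataset.get_splits(1)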
diff --git a/setup.py b/setup.py
index c9658a63..3df32765 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,7 @@
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
],
description='Pipelines and primitives for machine learning and data science.',
extras_require={
@@ -115,7 +116,7 @@
long_description_content_type='text/markdown',
name='mlblocks',
packages=find_packages(include=['mlblocks', 'mlblocks.*']),
- python_requires='>=3.6,<3.11',
+ python_requires='>=3.6,<3.12',
setup_requires=setup_requires,
test_suite='tests',
tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index a589526a..27e499ed 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,9 @@
[tox]
-envlist = py3{6,7,8,9,10}, test-devel
+envlist = py3{6,7,8,9,10,11}, test-devel
[travis]
python =
+ 3.11: py11
3.10: py10
3.9: py39
3.8: py38, test-devel
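With the envlist extended, the new environment can be exercised locally with `tox -e py311`, assuming a Python 3.11 interpreter is available on the PATH.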
From d401d1026dec4c60a4daed19d97daee58f5b573c Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Tue, 26 Sep 2023 11:25:02 -0400
Subject: [PATCH 152/160] =?UTF-8?q?Bump=20version:=200.6.1.dev0=20?=
=?UTF-8?q?=E2=86=92=200.6.1.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
mlblocks/__init__.py | 2 +-
setup.cfg | 2 +-
setup.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 021d9734..86777d40 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.6.1.dev0'
+__version__ = '0.6.1.dev1'
__all__ = [
'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 40e7b099..33532996 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.6.1.dev0
+current_version = 0.6.1.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P