From 396114b6140a12ead0d52993c8e42d61d2e4dd13 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 7 Jan 2019 20:13:41 +0100 Subject: [PATCH 001/160] Discover primitives using entry_points --- mlblocks/primitives.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 337116e7..8902b672 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -9,11 +9,15 @@ import json import os +import pkg_resources import sys +_PRIMITIVES_FOLDER_NAME = 'mlprimitives' +_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives' _PRIMITIVES_PATHS = [ - os.path.join(os.getcwd(), 'mlblocks_primitives'), - os.path.join(sys.prefix, 'mlblocks_primitives'), + os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME), + os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME), # legacy + os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME), # legacy ] @@ -45,7 +49,13 @@ def get_primitives_paths(): list: The list of folders. """ - return _PRIMITIVES_PATHS + + primitives_paths = list() + for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME): + path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name) + primitives_paths.append(path) + + return _PRIMITIVES_PATHS + primitives_paths def load_primitive(name): @@ -69,10 +79,17 @@ def load_primitive(name): found. """ - for base_path in _PRIMITIVES_PATHS: - json_path = os.path.join(base_path, name + '.json') - if os.path.isfile(json_path): - with open(json_path, 'r') as json_file: - return json.load(json_file) + for base_path in get_primitives_paths(): + parts = name.split('.') + number_of_parts = len(parts) + + for folder_parts in range(number_of_parts): + folder = os.path.join(base_path, *parts[:folder_parts]) + filename = '.'.join(parts[folder_parts:]) + '.json' + json_path = os.path.join(folder, filename) + + if os.path.isfile(json_path): + with open(json_path, 'r') as json_file: + return json.load(json_file) raise ValueError("Unknown primitive: {}".format(name)) From da04277d5268b194bd33707735a14b79d1cf1239 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 8 Jan 2019 14:36:27 +0100 Subject: [PATCH 002/160] Fix import order and add tests --- mlblocks/primitives.py | 6 ++++-- tests/test_primitives.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 8902b672..d4825bf6 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -9,9 +9,10 @@ import json import os -import pkg_resources import sys +import pkg_resources + _PRIMITIVES_FOLDER_NAME = 'mlprimitives' _OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives' _PRIMITIVES_PATHS = [ @@ -52,7 +53,8 @@ def get_primitives_paths(): primitives_paths = list() for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME): - path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name) + module_path = os.path.join(*entry_point.module_name.split('.')) + path = pkg_resources.resource_filename(entry_point.name, module_path) primitives_paths.append(path) return _PRIMITIVES_PATHS + primitives_paths diff --git a/tests/test_primitives.py b/tests/test_primitives.py index 65906406..990c4da5 100644 --- a/tests/test_primitives.py +++ b/tests/test_primitives.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pytest +from pkg_resources import EntryPoint from mlblocks import primitives @@ -36,12 +37,39 @@ def test_add_primitives_path(): 
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -def test_get_primitives_paths(): +@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name') +def test_get_primitives_paths_no_entry_points(): paths = primitives.get_primitives_paths() assert paths == ['a', 'b'] +@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +def test_get_primitives_paths_entry_points(iep_mock): + # setup + iep_mock.return_value = [ + EntryPoint('mlblocks', 'primitives.jsons') + ] + + # run + paths = primitives.get_primitives_paths() + + # assert + expected = [ + 'a', + 'b', + os.path.join( + os.path.dirname(primitives.__file__), + 'primitives', + 'jsons' + ) + ] + assert paths == expected + + iep_mock.assert_called_once_with('mlprimitives') + + @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) def test_load_primitive_value_error(): with pytest.raises(ValueError): From f551d339217554472fb5cecc162d5ab31f0d10d6 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 8 Jan 2019 19:02:43 +0100 Subject: [PATCH 003/160] Change slightly the way the entry points are used and add docs --- .gitignore | 1 + docs/advanced_usage/adding_primitives.rst | 34 ++++++++- docs/api/mlblocks.primitives.rst | 5 ++ docs/index.rst | 1 + docs/pipeline.json | 91 ----------------------- mlblocks/primitives.py | 33 +++++--- tests/__init__.py | 0 tests/test_primitives.py | 30 +++++--- 8 files changed, 83 insertions(+), 112 deletions(-) create mode 100644 docs/api/mlblocks.primitives.rst delete mode 100644 docs/pipeline.json create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore index cbc1f8c1..011ff452 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/pipeline.json # PyBuilder target/ diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index fc2e81b9..e3d4b964 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -29,7 +29,7 @@ by writing the corresponding `JSON annotation .. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives -.. note:: If you integrate new primitives for MLBlocks, please consider contributing them to the +.. note:: If you create new primitives for MLBlocks, please consider contributing them to the **MLPrimitives** project! The first thing to do when adding a new primitive is making sure that it complies with the @@ -58,8 +58,8 @@ place known to **MLBlocks**. **MLBlocks** looks for primitives in the following folders, in this order: 1. Any folder specified by the user, starting by the latest one. -2. A folder named `mlblocks_primitives` in the current working directory. -3. A folder named `mlblocks_primitives` in the `system prefix`_. +2. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory. +3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_. .. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix @@ -80,3 +80,31 @@ However, sometimes you will want to add a custom directory. This can be easily done by using the `mlblocks.add_primitives_path`_ method. .. 
_mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path
+
+Developing a Primitives Library
+-------------------------------
+
+Another option to add multiple primitives is creating a primitives library, such as
+`MLPrimitives`_.
+
+In order to make **MLBlocks** able to find the primitives defined in such a library,
+all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
+following specification:
+
+1. It has to be published under the name ``mlprimitives``.
+2. It has to be named exactly ``jsons_path``.
+3. It has to point at a variable that contains the path to the JSONS folder.
+
+An example of such an entry point would be::
+
+    entry_points = {
+        'mlprimitives': [
+            'jsons_path=some_module:SOME_VARIABLE'
+        ]
+    }
+
+where the module `some_module` contains a variable such as::
+
+    SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+.. _Entry Point: https://packaging.python.org/specifications/entry-points/
diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst
new file mode 100644
index 00000000..d625c774
--- /dev/null
+++ b/docs/api/mlblocks.primitives.rst
@@ -0,0 +1,5 @@
+mlblocks.primitives
+===================
+
+.. automodule:: mlblocks.primitives
+    :members:
diff --git a/docs/index.rst b/docs/index.rst
index 28a3f0bb..2bb4c5a9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -74,6 +74,7 @@ integrate with deep learning libraries.
 
    api/mlblocks
    api/mlblocks.datasets
+   api/mlblocks.primitives
 
 .. toctree::
    :caption: Resources
diff --git a/docs/pipeline.json b/docs/pipeline.json
deleted file mode 100644
index c09d763c..00000000
--- a/docs/pipeline.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
-    "primitives": [
-        "sklearn.preprocessing.StandardScaler",
-        "sklearn.ensemble.RandomForestClassifier"
-    ],
-    "init_params": {
-        "sklearn.preprocessing.StandardScaler": {
-            "with_mean": false
-        },
-        "sklearn.ensemble.RandomForestClassifier": {
-            "n_estimators": 100
-        }
-    },
-    "input_names": {},
-    "output_names": {},
-    "hyperparameters": {
-        "sklearn.preprocessing.StandardScaler#1": {
-            "with_mean": false,
-            "with_std": true
-        },
-        "sklearn.ensemble.RandomForestClassifier#1": {
-            "n_jobs": -1,
-            "n_estimators": 100,
-            "criterion": "entropy",
-            "max_features": null,
-            "max_depth": 10,
-            "min_samples_split": 0.1,
-            "min_samples_leaf": 0.1,
-            "class_weight": null
-        }
-    },
-    "tunable_hyperparameters": {
-        "sklearn.preprocessing.StandardScaler#1": {
-            "with_std": {
-                "type": "bool",
-                "default": true
-            }
-        },
-        "sklearn.ensemble.RandomForestClassifier#1": {
-            "criterion": {
-                "type": "str",
-                "default": "entropy",
-                "values": [
-                    "entropy",
-                    "gini"
-                ]
-            },
-            "max_features": {
-                "type": "str",
-                "default": null,
-                "range": [
-                    null,
-                    "auto",
-                    "log2"
-                ]
-            },
-            "max_depth": {
-                "type": "int",
-                "default": 10,
-                "range": [
-                    1,
-                    30
-                ]
-            },
-            "min_samples_split": {
-                "type": "float",
-                "default": 0.1,
-                "range": [
-                    0.0001,
-                    0.5
-                ]
-            },
-            "min_samples_leaf": {
-                "type": "float",
-                "default": 0.1,
-                "range": [
-                    0.0001,
-                    0.5
-                ]
-            },
-            "class_weight": {
-                "type": "str",
-                "default": null,
-                "range": [
-                    null,
-                    "balanced"
-                ]
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index d4825bf6..8aaaa60f 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -13,12 +13,11 @@
 
 import pkg_resources
 
-_PRIMITIVES_FOLDER_NAME = 'mlprimitives'
-_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives'
 _PRIMITIVES_PATHS = [
-    os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME),
-    os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME),    # legacy
-    os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME),     # legacy
+    os.path.join(os.getcwd(), 'mlprimitives'),
+    os.path.join(sys.prefix, 'mlprimitives'),
+    os.path.join(os.getcwd(), 'mlblocks_primitives'),    # legacy
+    os.path.join(sys.prefix, 'mlblocks_primitives'),     # legacy
 ]
 
 
@@ -46,16 +45,32 @@ def add_primitives_path(path):
 def get_primitives_paths():
     """Get the list of folders where the primitives will be looked for.
 
+    This list will include the value of any `entry_point` named `jsons_path` published under
+    the name `mlprimitives`.
+
+    An example of such an entry point would be::
+
+        entry_points = {
+            'mlprimitives': [
+                'jsons_path=some_module:SOME_VARIABLE'
+            ]
+        }
+
+    where the module `some_module` contains a variable such as::
+
+        SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
     Returns:
         list:
             The list of folders.
     """
 
     primitives_paths = list()
-    for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME):
-        module_path = os.path.join(*entry_point.module_name.split('.'))
-        path = pkg_resources.resource_filename(entry_point.name, module_path)
-        primitives_paths.append(path)
+    entry_points = pkg_resources.iter_entry_points('mlprimitives')
+    for entry_point in entry_points:
+        if entry_point.name == 'jsons_path':
+            path = entry_point.load()
+            primitives_paths.append(path)
 
     return _PRIMITIVES_PATHS + primitives_paths
 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_primitives.py b/tests/test_primitives.py
index 990c4da5..1afd17b6 100644
--- a/tests/test_primitives.py
+++ b/tests/test_primitives.py
@@ -7,10 +7,12 @@
 from unittest.mock import patch
 
 import pytest
-from pkg_resources import EntryPoint
+from pkg_resources import Distribution, EntryPoint
 
 from mlblocks import primitives
 
+FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
 def test_add_primitives_path_do_nothing():
@@ -36,19 +39,33 @@ def test_add_primitives_path():
 
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name')
-def test_get_primitives_paths_no_entry_points():
+@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+def test_get_primitives_paths_no_entry_points(iep_mock):
+    # setup
+    iep_mock.return_value = []
+
+    # run
     paths = primitives.get_primitives_paths()
 
+    # assert
     assert paths == ['a', 'b']
+    iep_mock.assert_called_once_with('mlprimitives')
 
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
 @patch('mlblocks.primitives.pkg_resources.iter_entry_points')
 def test_get_primitives_paths_entry_points(iep_mock):
     # setup
+    something_else_ep = EntryPoint('something_else', 'mlblocks.__version__')
+    jsons_path_ep = EntryPoint(
+        'jsons_path',
+        'tests.test_primitives',
+        attrs=['FAKE_MLPRIMITIVES_PATH'],
+        dist=Distribution()
+    )
     iep_mock.return_value = [
-        EntryPoint('mlblocks', 'primitives.jsons')
+        something_else_ep,
+        jsons_path_ep
     ]
 
     # run
@@ -59,11 +75,7 @@ def test_get_primitives_paths_entry_points(iep_mock):
     expected = [
         'a',
         'b',
-        os.path.join(
-            os.path.dirname(primitives.__file__),
-            'primitives',
-            'jsons'
-        )
+        'this/is/a/fake'
     ]
     assert paths == expected
 

From 74da0e2249cb30100229c64cd0a83a4543daf12e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 9 Jan 2019 16:06:47 +0100
Subject: [PATCH 004/160] Add logging statements

---
 mlblocks/__init__.py   |  6 +++---
mlblocks/datasets.py | 9 +++++++++ mlblocks/mlblock.py | 6 +++++- mlblocks/mlpipeline.py | 38 ++++++++++++++++++++++++-------------- mlblocks/primitives.py | 5 +++++ 5 files changed, 46 insertions(+), 18 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index cfc0ef6a..43079986 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -10,9 +10,9 @@ * Documentation: https://HDI-Project.github.io/MLBlocks """ -from mlblocks.mlblock import MLBlock # noqa -from mlblocks.mlpipeline import MLPipeline # noqa -from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive # noqa +from mlblocks.mlblock import MLBlock +from mlblocks.mlpipeline import MLPipeline +from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive __author__ = 'MIT Data To AI Lab' __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index fba968e8..b5ed6b46 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -40,6 +40,7 @@ """ import io +import logging import os import tarfile import urllib @@ -52,6 +53,8 @@ from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score from sklearn.model_selection import KFold, StratifiedKFold, train_test_split +LOGGER = logging.getLogger(__name__) + INPUT_SHAPE = [224, 224, 3] DATA_PATH = os.path.join( @@ -183,9 +186,12 @@ def get_splits(self, n_splits=1): def _download(dataset_name, dataset_path): url = DATA_URL.format(dataset_name) + + LOGGER.debug('Downloading dataset %s from %s', dataset_name, url) response = urllib.request.urlopen(url) bytes_io = io.BytesIO(response.read()) + LOGGER.debug('Extracting dataset into %s', DATA_PATH) with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf: tf.extractall(DATA_PATH) @@ -202,6 +208,7 @@ def _load(dataset_name): def _load_images(image_dir, filenames): + LOGGER.debug('Loading %s images from %s', len(filenames), image_dir) images = [] for filename in filenames: filename = os.path.join(image_dir, filename) @@ -217,6 +224,8 @@ def _load_images(image_dir, filenames): def _load_csv(dataset_path, name, set_index=False): csv_path = os.path.join(dataset_path, name + '.csv') + + LOGGER.debug('Loading csv %s', csv_path) df = pd.read_csv(csv_path) if set_index: diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 9b6ec0d0..04a4bf55 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -3,9 +3,12 @@ """Package where the MLBlock class is defined.""" import importlib +import logging from mlblocks.primitives import load_primitive +LOGGER = logging.getLogger(__name__) + def import_object(object_name): """Import an object from its Fully Qualified Name.""" @@ -83,7 +86,7 @@ def _extract_params(self, kwargs, hyperparameters): value = param['default'] else: - raise TypeError("Required argument '{}' not found".format(name)) + raise TypeError("{} required argument '{}' not found".format(self.name, name)) init_params[name] = value @@ -193,6 +196,7 @@ def set_hyperparameters(self, hyperparameters): self._hyperparameters.update(hyperparameters) if self._class: + LOGGER.debug('Creating a new primitive instance for %s', self.name) self.instance = self.primitive(**self._hyperparameters) def fit(self, **kwargs): diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 4bad5d1f..058737ee 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -215,19 +215,25 @@ def fit(self, X=None, y=None, **kwargs): last_block_name = list(self.blocks.keys())[-1] for block_name, 
block in self.blocks.items(): - fit_args = self._get_block_args(block_name, block.fit_args, context) - LOGGER.debug("Fitting block %s", block_name) - block.fit(**fit_args) + try: + fit_args = self._get_block_args(block_name, block.fit_args, context) + block.fit(**fit_args) + except Exception: + LOGGER.exception("Exception caught fitting MLBlock %s", block_name) + raise if block_name != last_block_name: - produce_args = self._get_block_args(block_name, block.produce_args, context) - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + output_dict = self._get_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) + except Exception: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + raise def predict(self, X=None, **kwargs): """Produce predictions using the blocks of this pipeline. @@ -252,14 +258,18 @@ def predict(self, X=None, **kwargs): last_block_name = list(self.blocks.keys())[-1] for block_name, block in self.blocks.items(): - produce_args = self._get_block_args(block_name, block.produce_args, context) - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + outputs = block.produce(**produce_args) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if block_name != last_block_name: + output_dict = self._get_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) + + except Exception: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + raise return outputs diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 337116e7..c6e50790 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -8,9 +8,12 @@ """ import json +import logging import os import sys +LOGGER = logging.getLogger(__name__) + _PRIMITIVES_PATHS = [ os.path.join(os.getcwd(), 'mlblocks_primitives'), os.path.join(sys.prefix, 'mlblocks_primitives'), @@ -35,6 +38,7 @@ def add_primitives_path(path): if not os.path.isdir(path): raise ValueError('Invalid path: {}'.format(path)) + LOGGER.debug('Adding new primitives path %s', path) _PRIMITIVES_PATHS.insert(0, os.path.abspath(path)) @@ -73,6 +77,7 @@ def load_primitive(name): json_path = os.path.join(base_path, name + '.json') if os.path.isfile(json_path): with open(json_path, 'r') as json_file: + LOGGER.debug('Loading primitive %s from %s', name, json_path) return json.load(json_file) raise ValueError("Unknown primitive: {}".format(name)) From a2cf239fd22d5d8c0e50eabef11d65f5f2c65bbd Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 9 Jan 2019 18:04:33 +0100 Subject: [PATCH 005/160] Filter conditionals from tunable hyperparameters --- mlblocks/mlblock.py | 35 +++++-- tests/test_mlblock.py | 219 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 7 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 04a4bf55..618ebc75 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -110,6 +110,33 @@ def _extract_params(self, kwargs, hyperparameters): return init_params, fit_params, produce_params + @staticmethod + def 
_filter_conditional(conditional, init_params): + condition = conditional['condition'] + if condition not in init_params: + return conditional + + condition_value = init_params[condition] + values = conditional['values'] + conditioned = values.get(condition_value) or values.get('*') + if conditioned: + return conditioned + + @classmethod + def _get_tunable(cls, hyperparameters, init_params): + tunable = dict() + for name, param in hyperparameters.get('tunable', dict()).items(): + if name not in init_params: + if param['type'] == 'conditional': + param = cls._filter_conditional(param, init_params) + if param is not None: + tunable[name] = param + + else: + tunable[name] = param + + return tunable + def __init__(self, name, **kwargs): self.name = name @@ -136,13 +163,7 @@ def __init__(self, name, **kwargs): self._fit_params = fit_params self._produce_params = produce_params - tunable = hyperparameters.get('tunable', dict()) - self._tunable = { - name: param - for name, param in tunable.items() - if name not in init_params - # TODO: filter conditionals - } + self._tunable = self._get_tunable(hyperparameters, init_params) default = { name: param['default'] diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index abc235b0..970df5ed 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -23,6 +23,225 @@ class TestMLBlock(TestCase): def test__extract_params(self): pass + def test__get_tunable_no_conditionals(self): + """If there are no conditionals, tunables are returned unmodified.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + } + } + assert tunable == expected + + def test__get_tunable_no_condition(self): + """If there is a conditiona but no condition, conditional is returned unmodified.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 1: { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 1: { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + assert tunable == expected + + def test__get_tunable_condition_match(self): + """If there is a conditional and it matches, only that part is returned.""" + + # setup + init_params = { + 'a_condition': 'match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 'match': { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 
'this_is_conditional': {
+                'type': 'int',
+                'default': 0
+            }
+        }
+        assert tunable == expected
+
+    def test__get_tunable_condition_wildcard_match(self):
+        """If there is a conditional and it matches the wildcard, only that part is returned."""
+
+        # setup
+        init_params = {
+            'a_condition': 'no_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': 1,
+                    'values': {
+                        'match': {
+                            'type': 'int',
+                            'default': 0
+                        },
+                        '*': {
+                            'type': 'str',
+                            'default': 'whatever'
+                        }
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1
+            },
+            'this_is_conditional': {
+                'type': 'str',
+                'default': 'whatever'
+            }
+        }
+        assert tunable == expected
+
+    def test__get_tunable_condition_no_match(self):
+        """If there is a conditional without match or wildcard, it is not returned."""
+
+        # setup
+        init_params = {
+            'a_condition': 'no_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': 1,
+                    'values': {
+                        'match': {
+                            'type': 'int',
+                            'default': 0
+                        }
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1
+            }
+        }
+        assert tunable == expected
+
     @patch('mlblocks.mlblock.MLBlock.set_hyperparameters')
     @patch('mlblocks.mlblock.import_object')
     @patch('mlblocks.mlblock.load_primitive')

From 31b36d4779e8faeb38449025aec30b0b90c51378 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 17:59:03 +0100
Subject: [PATCH 006/160] Changed slightly the behavior of the conditional
 hyperparameters. Also include docs

---
 docs/advanced_usage/hyperparameters.rst |  19 ++-
 mlblocks/mlblock.py                     |   8 +-
 tests/test_mlblock.py                   | 192 +++++++++++++++++-------
 3 files changed, 156 insertions(+), 63 deletions(-)

diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst
index bc31d4fd..71686ac5 100644
--- a/docs/advanced_usage/hyperparameters.rst
+++ b/docs/advanced_usage/hyperparameters.rst
@@ -165,6 +165,19 @@ Conditional Hyperparameters
 
 In some other cases, the values that a hyperparameter can take depend on the value of
 another one.
+For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending
+on the kernel used some other hyperparameters may or may not be used, or they might be able
+to take only some specific values.
+
+In this case, the ``type`` of the hyperparameter whose values depend on the other is specified
+as ``conditional``.
+When this happens, two additional entries are required:
+
+* an entry called ``condition``, which specifies the name of the other hyperparameter, the value
+  of which is evaluated to decide which values this hyperparameter can take.
+* an additional subdictionary called ``values``, which relates the possible values that the
+  `condition` hyperparameter can have with the full specifications of the type and values that
+  this hyperparameter can take in each case.
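For illustration only, the resolution of such a specification can be sketched in a few
lines of Python. This is a simplified approximation of the logic described above, not the
actual **MLBlocks** implementation, and the function name is made up for the example::

    def resolve_conditional(spec, init_params):
        # Look up the value of the hyperparameter named by ``condition``.
        condition_value = init_params.get(spec['condition'])

        # If it matches an entry in ``values``, use that specification;
        # otherwise fall back to the ``default`` entry of the conditional.
        return spec['values'].get(condition_value, spec.get('default'))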
Suppose, for example, that the primitive explained in the previous point does not expect the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter, @@ -190,7 +203,7 @@ In this case, the hyperparameters would be annotated like this:: } "max_features_aggregation": { "type": "conditional", - "condition": "mas_features", + "condition": "max_features", "default": null, "values": { "auto": { @@ -202,6 +215,10 @@ In this case, the hyperparameters would be annotated like this:: } } +.. note:: Just like a regular hyperparameter, if there is no match the default entry is used. + In this example, the ``null`` value indicates that the hyperparameter needs to be + disabled if there is no match, but instead of it we could add there a full specification + of type, range and default value as a nested dictionary to be used by default. .. _JSON Annotations: primitives.html#json-annotations .. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 618ebc75..a5cdb6a4 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -113,14 +113,14 @@ def _extract_params(self, kwargs, hyperparameters): @staticmethod def _filter_conditional(conditional, init_params): condition = conditional['condition'] + default = conditional.get('default') + if condition not in init_params: - return conditional + return default condition_value = init_params[condition] values = conditional['values'] - conditioned = values.get(condition_value) or values.get('*') - if conditioned: - return conditioned + return values.get(condition_value, default) @classmethod def _get_tunable(cls, hyperparameters, init_params): diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 970df5ed..5273d40c 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -34,7 +34,8 @@ def test__get_tunable_no_conditionals(self): 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] } } } @@ -46,13 +47,14 @@ def test__get_tunable_no_conditionals(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] } } assert tunable == expected def test__get_tunable_no_condition(self): - """If there is a conditiona but no condition, conditional is returned unmodified.""" + """If there is a conditional but no condition, the default is used.""" # setup init_params = { @@ -62,20 +64,27 @@ def test__get_tunable_no_condition(self): 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 1: { - 'type': 'int', - 'default': 0 - }, - '*': { + 'not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -89,22 +98,13 @@ def test__get_tunable_no_condition(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { - 'type': 'conditional', - 'condition': 'a_condition', - 'default': 1, - 'values': { - 1: { - 'type': 'int', - 'default': 0 - }, - '*': { - 'type': 'str', - 'default': 'whatever' - } - } + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] } } assert tunable == expected @@ -114,26 +114,33 @@ def 
test__get_tunable_condition_match(self): # setup init_params = { - 'a_condition': 'match' + 'a_condition': 'a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 'match': { - 'type': 'int', - 'default': 0 - }, - '*': { + 'not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -147,40 +154,49 @@ def test__get_tunable_condition_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'int', - 'default': 0 + 'default': 0, + 'range': [1, 10] } } assert tunable == expected - def test__get_tunable_condition_wildcard_match(self): - """If there is a conditional and it matches the wildcard, only that part is returned.""" + def test__get_tunable_condition_no_match(self): + """If there is a conditional and it does not match, the default is used.""" # setup init_params = { - 'a_condition': 'no_match' + 'a_condition': 'not_a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 'match': { - 'type': 'int', - 'default': 0 - }, - '*': { + 'also_not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -194,36 +210,45 @@ def test__get_tunable_condition_wildcard_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { - 'type': 'str', - 'default': 'whatever' + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] } } assert tunable == expected - def test__get_tunable_condition_no_match(self): - """If there is a conditional without match or wildcard, it is not returned.""" + def test__get_tunable_condition_default_null(self): + """If there is no match and default is null (None), this param is not included.""" # setup init_params = { - 'a_condition': 'no_match' + 'a_condition': 'not_a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': None, 'values': { - 'match': { + 'also_not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { 'type': 'int', - 'default': 0 + 'default': 0, + 'range': [1, 10] } } } @@ -237,7 +262,58 @@ def test__get_tunable_condition_no_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] + } + } + assert tunable == expected + + def test__get_tunable_condition_match_null(self): + """If there is a match and it is null (None), this param is not included. + + This stands even if the default is not null. 
+        """
+
+        # setup
+        init_params = {
+            'a_condition': 'a_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1,
+                    'range': [1, 10]
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': {
+                        'type': 'float',
+                        'default': 0.1,
+                        'values': [0, 1]
+                    },
+                    'values': {
+                        'not_a_match': {
+                            'type': 'str',
+                            'default': 'a',
+                            'values': ['a', 'b']
+                        },
+                        'a_match': None
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1,
+                'range': [1, 10]
+            }
+        }
+        assert tunable == expected
+
     @patch('mlblocks.mlblock.MLBlock.set_hyperparameters')
     @patch('mlblocks.mlblock.import_object')
     @patch('mlblocks.mlblock.load_primitive')

From 0ede2707da8e4d2866416cac28e2b03c69a68e47 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:26:21 +0100
Subject: [PATCH 007/160] Release notes for v0.3.0

---
 HISTORY.md           | 7 +++++++
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 7 +++----
 setup.py             | 2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index d08624dc..a312c9cb 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+0.3.0 - New Primitives Discovery
+--------------------------------
+
+* New primitives discovery system based on `entry_points`.
+* Conditional Hyperparameters filtering in MLBlock initialization.
+* Improved logging and exception reporting.
+
 0.2.4 - New Datasets and Unit Tests
 -----------------------------------
 
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 43079986..3a9e6bcb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.2.5-dev'
+__version__ = '0.3.0-dev'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index fbc69b07..a9255027 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
 [bumpversion]
-current_version = 0.2.5-dev
+current_version = 0.3.0-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize = 
+serialize =
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values = 
+values =
 	dev
 	release
diff --git a/setup.py b/setup.py
index 9d4b4cfc..5c21f44b 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.2.5-dev',
+    version='0.3.0-dev',
     zip_safe=False,
 )

From bb0bb0d44bcc44e1517825409e1d092670ddde27 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:33:45 +0100
Subject: [PATCH 008/160] =?UTF-8?q?Bump=20version:=200.3.0-dev=20=E2=86=92?=
 =?UTF-8?q?=200.3.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 7 ++++---
 setup.py             | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3a9e6bcb..93bd80bb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.0-dev'
+__version__ = '0.3.0'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9255027..3026d2ba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
 [bumpversion]
-current_version = 0.3.0-dev
+current_version = 0.3.0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize = 
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values =
+values = 
 	dev
 	release
 
@@ -45,3 +45,4 @@ collect_ignore = ['setup.py']
 
 [tool:pylint]
 good-names = X,y
+
diff --git a/setup.py b/setup.py
index 5c21f44b..a59a74f0 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.0-dev',
+    version='0.3.0',
     zip_safe=False,
 )

From e1ca77bce3c4537c0800a4c1395e1b6bbde5465d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:34:07 +0100
Subject: [PATCH 009/160] =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?=
 =?UTF-8?q?.3.1-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 93bd80bb..cf326495 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.0'
+__version__ = '0.3.1-dev'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 3026d2ba..e976dec7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0
+current_version = 0.3.1-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
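The ``parse`` expression in the ``bumpversion`` configuration above is a plain Python
regular expression. A minimal sanity check, added here purely for illustration (the
``PARSE`` constant simply mirrors the configured expression), can exercise it with the
standard ``re`` module::

    import re

    PARSE = r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?'

    # The named groups split a version string into its serialized parts.
    match = re.match(PARSE, '0.3.1-dev')
    assert match.groupdict() == {
        'major': '0', 'minor': '3', 'patch': '1', 'release': 'dev'
    }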
diff --git a/setup.py b/setup.py index a59a74f0..a8ac84d7 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.0', + version='0.3.1-dev', zip_safe=False, ) From d3cbee730139b2d0117a1de1474a581844505196 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 19 Apr 2019 13:38:02 +0200 Subject: [PATCH 010/160] Initial implementation to work with intermediate outputs --- mlblocks/mlpipeline.py | 82 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 058737ee..d5928b69 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -166,7 +166,7 @@ def _get_block_args(self, block_name, block_args, context): return kwargs - def _get_outputs(self, block_name, outputs, block_outputs): + def _extract_outputs(self, block_name, outputs, block_outputs): # TODO: type validation and/or transformation should be done here if not isinstance(outputs, tuple): @@ -188,7 +188,40 @@ def _get_outputs(self, block_name, outputs, block_outputs): return output_dict - def fit(self, X=None, y=None, **kwargs): + def _get_block_name(self, index): + return list(self.blocks.keys())[index] + + def _get_output_spec(self, output): + if output is None: + return None, None + + if isinstance(output, int): + output = self._get_block_name(output) + + if output in self.blocks: + return output, None + + if '.' in output: + output_block, output_variable = output.rsplit('.', 1) + if output_block not in self.blocks: + raise ValueError('Unknown block name: {}'.format(output_block)) + + return output_block, output_variable + + last_block_name = self._get_block_name(-1) + return last_block_name, output + + def _get_output(self, output_variable, context): + if output_variable: + if output_variable not in context: + raise ValueError('Output variable {} not found in context' + .format(output_variable)) + + return context[output_variable] + else: + return context + + def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs): """Fit the blocks of this pipeline. 
Sequentially call the `fit` and the `produce` methods of each block, @@ -213,8 +246,19 @@ def fit(self, X=None, y=None, **kwargs): } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output) + last_block_name = self._get_block_name(-1) + + if isinstance(skip_to, int): + skip_to = self._get_block_name(skip_to) + for block_name, block in self.blocks.items(): + if block_name == skip_to: + skip_to = False + elif skip_to: + LOGGER.debug("Skipping block %s fit", block_name) + continue + LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) @@ -223,19 +267,22 @@ def fit(self, X=None, y=None, **kwargs): LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - if block_name != last_block_name: + if (block_name != last_block_name) or (block_name == output_block): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) context.update(output_dict) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def predict(self, X=None, **kwargs): + if block_name == output_block: + return self._get_output(output_variable, context) + + def predict(self, X=None, output='y', skip_to=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -256,22 +303,35 @@ def predict(self, X=None, **kwargs): } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output) + + if isinstance(skip_to, int): + skip_to = self._get_block_name(skip_to) + for block_name, block in self.blocks.items(): + if block_name == skip_to: + skip_to = False + elif skip_to: + LOGGER.debug("Skipping block %s produce", block_name) + continue + LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if block_name == output_block: + return self._get_output(output_variable, context) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - return outputs + if skip_to: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(skip_to)) def to_dict(self): """Return all the details of this MLPipeline in a dict. From 59fae909d44afb78005425c6c4a24de567391eb5 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:48:38 +0200 Subject: [PATCH 011/160] Update contributing guide to match the current release workflow --- CONTRIBUTING.rst | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2db74080..4fce53bf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -172,24 +172,26 @@ The process of releasing a new version involves several steps combining both ``g 1. 
Merge what is in ``master`` branch into ``stable`` branch.
 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
-3. Create a new TAG pointing at the correspoding commit in ``stable`` branch.
+3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
 4. Merge the new commit from ``stable`` into ``master``.
-5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next
-   development interation.
+5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
+   to open the next development iteration.
 
-**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled
-**Unreleased** with the list of changes that will be included in the new version, and that
-these changes are committed and available in ``master`` branch.
-Normally this is just a list of the Pull Requests that have been merged since the latest version.
+.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
+   entry that explains the changes that will be included in the new version.
+   Normally this is just a list of the Pull Requests that have been merged to master
+   since the last release.
 
-Once this is done, just run the following commands::
+Once this is done, run one of the following commands:
+
+1. If you are releasing a patch version::
 
-    git checkout stable
-    git merge --no-ff master   # This creates a merge commit
-    bumpversion release   # This creates a new commit and a TAG
-    git push --tags origin stable
     make release
-    git checkout master
-    git merge stable
-    bumpversion --no-tag patch
-    git push
+
+2. If you are releasing a minor version::
+
+    make release-minor
+
+3. If you are releasing a major version::
+
+    make release-major

From e768037076387fcb9a33e494c9c89421f0c657a8 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:49:47 +0200
Subject: [PATCH 012/160] Update docs config

---
 Makefile           |  1 -
 docs/changelog.rst |  2 +-
 docs/conf.py       | 20 +++++++-------------
 setup.py           |  1 -
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index dc62e90d..c2d2aaa4 100644
--- a/Makefile
+++ b/Makefile
@@ -122,7 +122,6 @@ coverage: ## check code coverage quickly with the default Python
 .PHONY: docs
 docs: clean-docs ## generate Sphinx HTML documentation, including API docs
 	$(MAKE) -C docs html
-	touch docs/_build/html/.nojekyll
 
 .PHONY: view-docs
 view-docs: docs ## view docs in browser
diff --git a/docs/changelog.rst b/docs/changelog.rst
index fcd2eb2d..d26e5be8 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1 +1 @@
-.. include:: ../HISTORY.md
+.. mdinclude:: ../HISTORY.md
diff --git a/docs/conf.py b/docs/conf.py
index 8659996f..9b4595ec 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,18 +18,9 @@
 # relative to the documentation root, use os.path.abspath to make it
 # absolute, like shown here.
 
-import os
-import sys
-
 import sphinx_rtd_theme  # For read the docs theme
-from recommonmark.parser import CommonMarkParser
-# from recommonmark.transform import AutoStructify
-
-# sys.path.insert(0, os.path.abspath('..'))
 
 import mlblocks
-#
-# mlblocks.add_primitives_path('../mlblocks_primitives')
 
 # -- General configuration ---------------------------------------------
 
@@ -40,8 +31,11 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
-    'sphinx.ext.napoleon',
+    'm2r',
+    'sphinx.ext.autodoc',
     'sphinx.ext.githubpages',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
     'sphinx.ext.graphviz',
     'IPython.sphinxext.ipython_console_highlighting',
     'IPython.sphinxext.ipython_directive',
@@ -56,9 +50,9 @@
 # You can specify multiple suffix as a list of string:
 source_suffix = ['.rst', '.md', '.ipynb']
 
-source_parsers = {
-    '.md': CommonMarkParser,
-}
+# source_parsers = {
+#     '.md': CommonMarkParser,
+# }
 
 # The master toctree document.
 master_doc = 'index'
diff --git a/setup.py b/setup.py
index a8ac84d7..f6991ab1 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
     'graphviz==0.9',
     'ipython==6.5.0',
     'matplotlib==2.2.3',
-    'recommonmark>=0.4.0',
 
     # style check
     'flake8>=3.5.0',

From 080580d45c9b47680fbc31d30aee4e8478292711 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:08 +0200
Subject: [PATCH 013/160] Remove spaces

---
 setup.cfg | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e976dec7..62ced521 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,13 +3,13 @@
 current_version = 0.3.1-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize = 
+serialize =
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values = 
+values =
 	dev
 	release
 
@@ -45,4 +45,3 @@ collect_ignore = ['setup.py']
 
 [tool:pylint]
 good-names = X,y
-

From e25fa6d3ac3af2f20b205ed73d91d28124bc8c16 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:32 +0200
Subject: [PATCH 014/160] Add docstrings

---
 mlblocks/mlpipeline.py | 127 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 14 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d5928b69..abbac922 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -69,6 +69,7 @@ class MLPipeline():
     """
 
     def _get_tunable_hyperparameters(self):
+        """Get the tunable hyperparameters from all the blocks in this pipeline."""
        tunable = {}
         for block_name, block in self.blocks.items():
             tunable[block_name] = block.get_tunable_hyperparameters()
@@ -140,6 +141,24 @@ def set_hyperparameters(self, hyperparameters):
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
 
     def _get_block_args(self, block_name, block_args, context):
+        """Get the arguments expected by the block method from the context.
+
+        The arguments will be taken from the context using both the method
+        arguments specification and the `input_names` given when the pipeline
+        was created.
+
+        Args:
+            block_name (str): Name of this block. Used to find the corresponding
+                input_names.
+            block_args (list): list of method argument specifications from the
+                primitive.
+            context (dict): current context dictionary.
+
+        Returns:
+            dict:
+                A dictionary containing the argument names and values to pass
+                to the method.
+        """
         # TODO: type validation and/or transformation should be done here
 
         input_names = self.input_names.get(block_name, dict())
@@ -167,6 +186,7 @@ def _get_block_args(self, block_name, block_args, context):
         return kwargs
 
     def _extract_outputs(self, block_name, outputs, block_outputs):
+        """Extract the outputs of the method as a dict to be set into the context."""
         # TODO: type validation and/or transformation should be done here
 
         if not isinstance(outputs, tuple):
@@ -189,9 +209,36 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
         return output_dict
 
     def _get_block_name(self, index):
+        """Get the name of the block in the `index` position."""
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
+        """Parse the output specification and get a block name and a variable name.
+
+        The output specification can be of two types: int and str.
+
+        If it is an integer, it is interpreted as a block index, and the variable name
+        is considered to be ``None``, which means that the whole context will be returned.
+
+        If it is a string, it is interpreted as the block name, and it has to match a block
+        name exactly, including its hash and counter number ``#n``. Optionally, a variable
+        name can be passed at the end using a ``'.'`` as a separator.
+        In this case, the format of the string is `{block_name}.{variable_name}`. Note
+        that the block name can also contain dots, so only the leftmost dot will be
+        considered, and only if the complete string does not match exactly a block name.
+
+        Args:
+            output (str or int): Output specification as either a string or an integer.
+
+        Returns:
+            tuple:
+                The output is a tuple containing:
+                    * block_name (str): name of the block from which the output will be
+                      returned, including its counter number.
+                    * variable_name (str): Name of the variable to extract from the context.
+                      It can be ``None``, which means that the whole context is to be
+                      returned.
+        """
         if output is None:
             return None, None
 
@@ -212,6 +259,10 @@ def _get_output_spec(self, output):
         return last_block_name, output
 
     def _get_output(self, output_variable, context):
+        """Get the specified output variable from the context.
+
+        If the variable name is ``None``, return the entire context.
+        """
         if output_variable:
             if output_variable not in context:
                 raise ValueError('Output variable {} not found in context'
@@ -221,7 +272,7 @@ def _get_output(self, output_variable, context):
         else:
             return context
 
-    def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
+    def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
         """Fit the blocks of this pipeline.
 
         Sequentially call the `fit` and the `produce` methods of each block,
@@ -237,8 +288,32 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
             X: Fit Data, which the pipeline will learn from.
             y: Fit Data labels, which the pipeline will use to learn how to
                 behave.
+            output (str or int): Output specification, which can be a string or an integer.
+                If an integer is given, it is interpreted as the block number, and the whole
+                context after running the specified block will be returned.
+                If a string is given, it is expected to be the name of one block, including
+                its counter number at the end. Optionally, a variable name can be included
+                at the end after the counter number using a ``'.'`` as a separator between the
+                block name and the variable name. If the variable name is given, this will be
+                extracted from the context and returned. Otherwise, the whole context will
+                be returned.
+            start_on (str or int): Block index or block name to start processing from. The
+                value can either be an integer, which will be interpreted as a block index,
+                or the name of a block, including the counter number at the end.
+                If given, the execution of the pipeline will start on the specified block,
+                and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
+
+        Returns:
+            None or dict or object:
+                * If no output is specified, nothing will be returned.
+                * If an output block has been specified without an output variable, the
+                  context dictionary will be returned after the produce method of that block
+                  has been called.
+                * If both an output block and an output variable have been specified,
+                  the value of that variable from the context will be extracted and returned
+                  after the produce method of that block has been called.
         """
         context = {
             'X': X,
            'y': y
         }
         context.update(kwargs)
@@ -249,13 +324,13 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
         output_block, output_variable = self._get_output_spec(output)
         last_block_name = self._get_block_name(-1)
 
-        if isinstance(skip_to, int):
-            skip_to = self._get_block_name(skip_to)
+        if isinstance(start_on, int):
+            start_on = self._get_block_name(start_on)
 
         for block_name, block in self.blocks.items():
-            if block_name == skip_to:
-                skip_to = False
-            elif skip_to:
+            if block_name == start_on:
+                start_on = False
+            elif start_on:
                 LOGGER.debug("Skipping block %s fit", block_name)
                 continue
 
@@ -282,7 +357,7 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
             if block_name == output_block:
                 return self._get_output(output_variable, context)
 
-    def predict(self, X=None, output='y', skip_to=None, **kwargs):
+    def predict(self, X=None, output='y', start_on=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
         Sequentially call the `produce` method of each block, capturing the
@@ -295,8 +370,32 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs):
 
         Args:
             X: Data which the pipeline will use to make predictions.
+            output (str or int): Output specification, which can be a string or an integer.
+                If an integer is given, it is interpreted as the block number, and the whole
+                context after running the specified block will be returned.
+                If a string is given, it is expected to be the name of one block, including
+                its counter number at the end. Optionally, a variable name can be included
+                at the end after the counter number using a ``'.'`` as a separator between the
+                block name and the variable name. If the variable name is given, this will be
+                extracted from the context and returned. Otherwise, the whole context will
+                be returned.
+            start_on (str or int): Block index or block name to start processing from. The
+                value can either be an integer, which will be interpreted as a block index,
+                or the name of a block, including the counter number at the end.
+                If given, the execution of the pipeline will start on the specified block,
+                and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
+
+        Returns:
+            None or dict or object:
+                * If no output is specified, the output of the last block will be returned.
+                * If an output block has been specified without an output variable, the
+                  context dictionary will be returned after the produce method of that block
+                  has been called.
+ * If both an output block and an output variable have been specified, + the value of that variable from the context will extracted and returned + after the produce method of that block has been called. """ context = { 'X': X @@ -305,13 +404,13 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): output_block, output_variable = self._get_output_spec(output) - if isinstance(skip_to, int): - skip_to = self._get_block_name(skip_to) + if isinstance(start_on, int): + start_on = self._get_block_name(start_on) for block_name, block in self.blocks.items(): - if block_name == skip_to: - skip_to = False - elif skip_to: + if block_name == start_on: + start_on = False + elif start_on: LOGGER.debug("Skipping block %s produce", block_name) continue @@ -329,9 +428,9 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - if skip_to: + if start_on: # We skipped all the blocks up to the end - raise ValueError('Unknown block name: {}'.format(skip_to)) + raise ValueError('Unknown block name: {}'.format(start_on)) def to_dict(self): """Return all the details of this MLPipeline in a dict. From 5e9be7aa7188d38ca6eafb684c24171b9e61f322 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:51:09 +0200 Subject: [PATCH 015/160] Update primitive names to match the latest versions of MLPrimitives --- docs/getting_started/quickstart.rst | 2 +- docs/pipeline_examples/graph.rst | 4 ++-- docs/pipeline_examples/text.rst | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2e00ece6..2115fcef 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,7 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] pipeline = MLPipeline(primitives) diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 5503e739..54ef85a1 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -39,7 +39,7 @@ additional information not found inside `X`. primitives = [ 'networkx.link_prediction_feature_extraction', - 'mlprimitives.feature_extraction.CategoricalEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'sklearn.preprocessing.StandardScaler', 'xgboost.XGBClassifier' ] @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. 
_XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index df8a9d5a..03472ea3 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -40,31 +40,31 @@ for later ones. # set up the pipeline primitives = [ - "mlprimitives.counters.UniqueCounter", - "mlprimitives.text.TextCleaner", - "mlprimitives.counters.VocabularyCounter", + "mlprimitives.custom.counters.UniqueCounter", + "mlprimitives.custom.text.TextCleaner", + "mlprimitives.custom.counters.VocabularyCounter", "keras.preprocessing.text.Tokenizer", "keras.preprocessing.sequence.pad_sequences", "keras.Sequential.LSTMTextClassifier" ] input_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "X": "y" } } output_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "counts": "classes" }, - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "counts": "vocabulary_size" } } init_params = { - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "add": 1 }, - "mlprimitives.text.TextCleaner#1": { + "mlprimitives.custom.text.TextCleaner#1": { "language": "en" }, "keras.preprocessing.sequence.pad_sequences#1": { @@ -116,12 +116,12 @@ to encode all the string features, and go directly into the nltk.download('stopwords') primitives = [ - 'mlprimitives.text.TextCleaner', - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.text.TextCleaner', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] init_params = { - 'mlprimitives.text.TextCleaner': { + 'mlprimitives.custom.text.TextCleaner': { 'column': 'text', 'language': 'nl' }, From 9f0ae6a3fa000896d8f530b72f6da46d23c31e4b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:33 +0200 Subject: [PATCH 016/160] Add random state to datasets get_splits --- mlblocks/datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index b5ed6b46..fb32df9c 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -141,7 +141,7 @@ def _get_split(data, index): else: return data[index] - def get_splits(self, n_splits=1): + def get_splits(self, n_splits=1, random_state=0): """Return splits of this dataset ready for Cross Validation. 
If n_splits is 1, a tuple containing the X for train and test @@ -166,12 +166,13 @@ def get_splits(self, n_splits=1): self.data, self.target, shuffle=self._shuffle, - stratify=stratify + stratify=stratify, + random_state=random_state ) else: cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) + cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state) splits = list() for train, test in cv.split(self.data, self.target): From 5aea64755b7b7f9b4e68f6faa9a0912c1a55033a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:58 +0200 Subject: [PATCH 017/160] Rename output and start arguments --- mlblocks/mlpipeline.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index abbac922..91e44341 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -272,7 +272,7 @@ def _get_output(self, output_variable, context): else: return context - def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): + def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. Sequentially call the `fit` and the `produce` methods of each block, @@ -288,7 +288,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): X: Fit Data, which the pipeline will learn from. y: Fit Data labels, which the pipeline will use to learn how to behave. - output (str or int): Output specification, which can be a string or an integer. + output_ (str or int): Output specification, which can be a string or an integer. If an integer is given, it is interpreted as the block number, and the whole context after running the specified block will be returned. If a string is given, it is expected to be the name of one block, including @@ -297,7 +297,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): block name and the variable name. If the variable name is given, this will be extracted from the context and returned. Otherwise, the whole context will be returned. - start_on (str or int): Block index or block name to start processing from. The + start_ (str or int): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, @@ -321,16 +321,16 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): } context.update(kwargs) - output_block, output_variable = self._get_output_spec(output) + output_block, output_variable = self._get_output_spec(output_) last_block_name = self._get_block_name(-1) - if isinstance(start_on, int): - start_on = self._get_block_name(start_on) + if isinstance(start_, int): + start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_on: - start_on = False - elif start_on: + if block_name == start_: + start_ = False + elif start_: LOGGER.debug("Skipping block %s fit", block_name) continue @@ -357,7 +357,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): if block_name == output_block: return self._get_output(output_variable, context) - def predict(self, X=None, output='y', start_on=None, **kwargs): + def predict(self, X=None, output_='y', start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. 
Sequentially call the `produce` method of each block, capturing the @@ -370,7 +370,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): Args: X: Data which the pipeline will use to make predictions. - output (str or int): Output specification, which can be a string or an integer. + output_ (str or int): Output specification, which can be a string or an integer. If an integer is given, it is interpreted as the block number, and the whole context after running the specified block will be returned. If a string is given, it is expected to be the name of one block, including @@ -379,7 +379,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): block name and the variable name. If the variable name is given, this will be extracted from the context and returned. Otherwise, the whole context will be returned. - start_on (str or int): Block index or block name to start processing from. The + start_ (str or int): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, @@ -402,15 +402,15 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): } context.update(kwargs) - output_block, output_variable = self._get_output_spec(output) + output_block, output_variable = self._get_output_spec(output_) - if isinstance(start_on, int): - start_on = self._get_block_name(start_on) + if isinstance(start_, int): + start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_on: - start_on = False - elif start_on: + if block_name == start_: + start_ = False + elif start_: LOGGER.debug("Skipping block %s produce", block_name) continue @@ -428,9 +428,9 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - if start_on: + if start_: # We skipped all the blocks up to the end - raise ValueError('Unknown block name: {}'.format(start_on)) + raise ValueError('Unknown block name: {}'.format(start_)) def to_dict(self): """Return all the details of this MLPipeline in a dict. 
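
Patch 014 introduced partial execution and this patch renames its control arguments:
`output_` stops the run at a given block and returns data from the context, while
`start_` resumes the run from a given block, skipping everything before it. The
trailing underscores keep the control arguments from colliding with context variables
passed through `**kwargs`. A rough usage sketch of the resulting API, assuming the
same two scikit-learn primitives that the tests added in the next patch use, and with
`X_train`, `y_train` and `X_test` standing in for any suitable data splits:

    from mlblocks import MLPipeline

    primitives = [
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ]
    pipeline = MLPipeline(primitives)

    # Fit only the first block and get the whole context back as a dict.
    context = pipeline.fit(X_train, y_train, output_=0)

    # The context can be inspected or modified here, and fitting can then
    # be resumed on the second block, skipping the first one.
    pipeline.fit(start_=1, **context)

    # predict accepts the same two control arguments.
    y_pred = pipeline.predict(X_test)
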
From 4607b3898aa9767774f872b936f2311492179746 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:13:12 +0200 Subject: [PATCH 018/160] Add unit tests for partial outputs feature --- tests/features/test_partial_outputs.py | 133 +++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 tests/features/test_partial_outputs.py diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py new file mode 100644 index 00000000..ce28d457 --- /dev/null +++ b/tests/features/test_partial_outputs.py @@ -0,0 +1,133 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np + +from mlblocks.datasets import load_iris +from mlblocks.mlpipeline import MLPipeline + + +def almost_equal(obj1, obj2): + if isinstance(obj1, dict): + if not isinstance(obj2, dict): + raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) + + for key, value in obj1.items(): + if key not in obj2: + raise AssertionError("{} not in {}".format(key, obj2)) + almost_equal(value, obj2[key]) + + else: + np.testing.assert_almost_equal(obj1, obj2) + + +class TestPartialOutputs(TestCase): + def setUp(self): + dataset = load_iris() + + self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1) + + def test_fit_output(self): + + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + int_block = 0 + invalid_int = 10 + str_block = 'sklearn.preprocessing.StandardScaler#1' + invalid_block = 'InvalidBlockName' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y' + invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' + + # Run + int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block) + str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block) + str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5], + output_=str_block_variable) + no_output = pipeline.fit(self.X_train, self.y_train) + + # Assert successful calls + X = np.array([ + [0.71269665, -1.45152899, 0.55344946, 0.31740553], + [0.26726124, 1.23648766, -1.1557327, -1.0932857], + [-1.95991577, 0.967686, -1.1557327, -1.0932857], + [0.71269665, -0.645124, 0.39067021, 0.31740553], + [0.26726124, -0.10752067, 1.36734573, 1.55176035] + ]) + y = np.array([1, 0, 0, 1, 2]) + context = { + 'X': X, + 'y': y + } + almost_equal(context, int_out) + almost_equal(context, str_out) + + almost_equal(y, str_out_variable) + + assert no_output is None + + # Run asserting exceptions + with self.assertRaises(IndexError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable) + + def test_fit_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X_train, + 'y': self.y_train + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.fit(start_=int_start, **context) + pipeline.fit(start_=str_start, **context) + + # Assert that mock 
has not been called + block_mock.fit.assert_not_called() + + def test_predict_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + pipeline.fit(self.X_train, self.y_train) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X_train, + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.predict(start_=int_start, **context) + pipeline.predict(start_=str_start, **context) + + # Assert that mock has not been called + block_mock.predict.assert_not_called() From 980794b67165e286d49cb81cf742ea44fd760365 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:14:23 +0200 Subject: [PATCH 019/160] Improve docstrings and add toc in autogenerated API reference --- Makefile | 5 + docs/conf.py | 9 +- mlblocks/datasets.py | 12 +- mlblocks/mlblock.py | 79 +++++++------ mlblocks/mlpipeline.py | 256 +++++++++++++++++++++++++++-------------- mlblocks/primitives.py | 3 +- setup.cfg | 6 + setup.py | 4 + 8 files changed, 234 insertions(+), 140 deletions(-) diff --git a/Makefile b/Makefile index c2d2aaa4..6266033f 100644 --- a/Makefile +++ b/Makefile @@ -98,6 +98,11 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort autopep8 --in-place --recursive --aggressive tests isort --apply --atomic --recursive tests +.PHONY: lint-docs +lint-docs: ## check docs formatting with doc8 and pydocstyle + doc8 mlblocks/ + pydocstyle mlblocks/ + # TEST TARGETS diff --git a/docs/conf.py b/docs/conf.py index 9b4595ec..95653914 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,8 +39,13 @@ 'sphinx.ext.graphviz', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', + 'autodocsumm', ] +autodoc_default_options = { + 'autosummary': True, +} + ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] # Add any paths that contain templates here, relative to this directory. @@ -50,10 +55,6 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md', '.ipynb'] -# source_parsers = { -# '.md': CommonMarkParser, -# } - # The master toctree document. master_doc = 'index' diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index fb32df9c..0c69afda 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -100,6 +100,7 @@ class Dataset(): **kwargs: Any additional keyword argument passed on initialization will be made available as instance attributes. """ + def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs): self.name = description.splitlines()[0] @@ -115,10 +116,10 @@ def __init__(self, description, data, target, score, shuffle=True, stratify=Fals self.__dict__.update(kwargs) def score(self, *args, **kwargs): - """Scoring function for this dataset. + r"""Scoring function for this dataset. Args: - \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be + \*args, \*\*kwargs: Any given arguments and keyword arguments will be directly passed to the given scoring function. Returns: @@ -315,7 +316,6 @@ def load_dic28(): There exist 52,652 words (vertices in a network) having 2 up to 8 characters in the dictionary. The obtained network has 89038 edges. 
""" - dataset_path = _load('dic28') X = _load_csv(dataset_path, 'data') @@ -344,7 +344,6 @@ def load_nomination(): Data consists of one graph whose nodes contain two attributes, attr1 and attr2. Associated with each node is a label that has to be learned and predicted. """ - dataset_path = _load('nomination') X = _load_csv(dataset_path, 'data') @@ -363,7 +362,6 @@ def load_amazon(): co-purchased with product j, the graph contains an undirected edge from i to j. Each product category provided by Amazon defines each ground-truth community. """ - dataset_path = _load('amazon') X = _load_csv(dataset_path, 'data') @@ -383,7 +381,6 @@ def load_jester(): source: "University of California Berkeley, CA" sourceURI: "http://eigentaste.berkeley.edu/dataset/" """ - dataset_path = _load('jester') X = _load_csv(dataset_path, 'data') @@ -393,7 +390,7 @@ def load_jester(): def load_wikiqa(): - """A Challenge Dataset for Open-Domain Question Answering. + """Challenge Dataset for Open-Domain Question Answering. WikiQA dataset is a publicly available set of question and sentence (QS) pairs, collected and annotated for research on open-domain question answering. @@ -401,7 +398,6 @@ def load_wikiqa(): source: "Microsoft" sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#" """ # noqa - dataset_path = _load('wikiqa') data = _load_csv(dataset_path, 'data', set_index=True) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index a5cdb6a4..c3878e68 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -25,32 +25,34 @@ class MLBlock(): as wrapping them and providing a common interface to run them. Attributes: - name (str): Name given to this MLBlock. - primitive (object): the actual function or instance which this MLBlock - wraps. - fit_args (dict): specification of the arguments expected by the `fit` - method. - fit_method (str): name of the primitive method to call on `fit`. - `None` if the primitive is a function. - produce_args (dict): specification of the arguments expected by the - `predict` method. - produce_output (dict): specification of the outputs of the `produce` - method. - produce_method (str): name of the primitive method to call on - `produce`. `None` if the primitive is a function. + name (str): + Name given to this MLBlock. + primitive (object): + the actual function or instance which this MLBlock wraps. + fit_args (dict): + specification of the arguments expected by the `fit` method. + fit_method (str): + name of the primitive method to call on `fit`. `None` if the primitive is a function. + produce_args (dict): + specification of the arguments expected by the `predict` method. + produce_output (dict): + specification of the outputs of the `produce` method. + produce_method (str): + name of the primitive method to call on `produce`. `None` if the primitive is a + function. Args: - name (str): Name given to this MLBlock. - **kwargs: Any additional arguments that will be used as - hyperparameters or passed to the `fit` or `produce` - methods. + name (str): + Name given to this MLBlock. + **kwargs: + Any additional arguments that will be used as hyperparameters or passed to the + `fit` or `produce` methods. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found within the `kwargs` or if an unexpected - argument has been given. 
- """ - # pylint: disable=too-many-instance-attributes + TypeError: + A `TypeError` is raised if a required argument is not found within the `kwargs` + or if an unexpected argument has been given. + """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. @@ -63,16 +65,16 @@ def _extract_params(self, kwargs, hyperparameters): have been given and that nothing unexpected exists in the input. Args: - kwargs (dict): dict containing the Keyword arguments that have - been passed to the `__init__` method upon - initialization. - hyperparameters (dict): hyperparameters dictionary, as found in - the JSON annotation. + kwargs (dict): + dict containing the Keyword arguments that have been passed to the `__init__` + method upon initialization. + hyperparameters (dict): + hyperparameters dictionary, as found in the JSON annotation. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found in the `kwargs` dict, or if an unexpected - argument has been given. + TypeError: + A `TypeError` is raised if a required argument is not found in the `kwargs` dict, + or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -138,7 +140,6 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable def __init__(self, name, **kwargs): - self.name = name metadata = load_primitive(name) @@ -174,6 +175,7 @@ def __init__(self, name, **kwargs): self.set_hyperparameters(default) def __str__(self): + """Return a string that represents this block.""" return 'MLBlock - {}'.format(self.name) def get_tunable_hyperparameters(self): @@ -210,9 +212,9 @@ def set_hyperparameters(self, hyperparameters): If necessary, a new instance of the primitive is created. Args: - hyperparameters (dict): Dictionary containing as keys the name - of the hyperparameters and as values - the values to be used. + hyperparameters (dict): + Dictionary containing as keys the name of the hyperparameters and as + values the values to be used. """ self._hyperparameters.update(hyperparameters) @@ -233,12 +235,13 @@ def fit(self, **kwargs): the primitive is a simple function, this will be a noop. Args: - **kwargs: Any given keyword argument will be directly passed - to the primitive fit method. + **kwargs: + Any given keyword argument will be directly passed to the primitive fit method. Raises: - TypeError: A `TypeError` might be raised if any argument not - expected by the primitive fit method is given. + TypeError: + A `TypeError` might be raised if any argument not expected by the primitive fit + method is given. """ if self.fit_method is not None: fit_args = self._fit_params.copy() diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 91e44341..eddb442e 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -34,38 +34,35 @@ class MLPipeline(): results, which will be returned as the prediction of the pipeline. Attributes: - primitives (list): List of the names of the primitives that compose - this pipeline. - blocks (list): OrderedDict of the block names and the corresponding - MLBlock instances. - init_params (dict): init_params dictionary, as given when the instance - was created. - input_names (dict): input_names dictionary, as given when the instance - was created. - output_names (dict): output_names dictionary, as given when the instance - was created. + primitives (list): + List of the names of the primitives that compose this pipeline. 
+        blocks (list):
+            OrderedDict of the block names and the corresponding MLBlock instances.
+        init_params (dict):
+            init_params dictionary, as given when the instance was created.
+        input_names (dict):
+            input_names dictionary, as given when the instance was created.
+        output_names (dict):
+            output_names dictionary, as given when the instance was created.
 
     Args:
-        primitives (list): List with the names of the primitives that will
-            compose this pipeline.
-        init_params (dict): dictionary containing initialization arguments to
-            be passed when creating the MLBlocks instances.
-            The dictionary keys must be the corresponding
-            primitive names and the values must be another
-            dictionary that will be passed as `**kargs` to the
-            MLBlock instance.
-        input_names (dict): dictionary that maps input variable names with the
-            actual names expected by each primitive. This
-            allows reusing the same input argument for multiple
-            primitives that name it differently, as well as
-            passing different values to primitives that expect
-            arguments named similary.
-        output_names (dict): dictionary that maps output variable names with
-            the name these variables will be given when stored
-            in the context dictionary. This allows storing
-            the output of different primitives in different
-            variables, even if the primitive output name is
-            the same one.
+        primitives (list):
+            List with the names of the primitives that will compose this pipeline.
+        init_params (dict):
+            dictionary containing initialization arguments to be passed when creating the
+            MLBlocks instances. The dictionary keys must be the corresponding primitive names
+            and the values must be another dictionary that will be passed as `**kwargs` to the
+            MLBlock instance.
+        input_names (dict):
+            dictionary that maps input variable names with the actual names expected by each
+            primitive. This allows reusing the same input argument for multiple primitives that
+            name it differently, as well as passing different values to primitives that expect
+            arguments named similarly.
+        output_names (dict):
+            dictionary that maps output variable names with the name these variables will be
+            given when stored in the context dictionary. This allows storing the output of
+            different primitives in different variables, even if the primitive output name is
+            the same one.
     """
 
     def _get_tunable_hyperparameters(self):
@@ -133,9 +130,9 @@ def set_hyperparameters(self, hyperparameters):
         """Set new hyperparameter values for some blocks.
 
         Args:
-            hyperparameters (dict): A dictionary containing the block names as
-                keys and the new hyperparameters dictionary
-                as values.
+            hyperparameters (dict):
+                A dictionary containing the block names as keys and the new hyperparameters
+                dictionary as values.
         """
         for block_name, block_hyperparams in hyperparameters.items():
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
@@ -148,11 +145,12 @@ def _get_block_args(self, block_name, block_args, context):
         was created.
 
         Args:
-            block_name (str): Name of this block. Used to find the corresponding
-                input_names.
-            block_args (list): list of method argument specifications from the
-                primitive.
-            context (dict): current context dictionary.
+            block_name (str):
+                Name of this block. Used to find the corresponding input_names.
+            block_args (list):
+                list of method argument specifications from the primitive.
+            context (dict):
+                current context dictionary.
         Returns:
             dict:
@@ -213,22 +211,40 @@ def _get_block_name(self, index):
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
-        """Parsre the output specification and get a block name and a variable name.
+        """Parse the output specification and get a block name and a variable name.
 
         The output specification can be of two types: int and str.
 
         If it is an integer, it is interpreted as a block index, and the variable name
         is considered to be ``None``, which means that the whole context will be returned.
 
-        If it is a string, it is interpreted as the block name, and it has to match a block
-        name exactly, including its hash and counter number ``#n``. Optionally, a variable
-        name can be passed at the end using a ``'.'`` as a separator.
-        In this case, the format of the string is `{block_name}.{variable_name}`. Note
-        that the block name can also contain dots, so only the leftmost dot will be
-        considered, and only if the complete string does not match exactly a block name.
+        If it is a string, it can be interpreted in three ways:
+
+        * **block name**: If the string matches a block name exactly, including
+          its hash and counter number ``#n`` at the end, the whole context will be
+          returned after that block is produced.
+        * **variable_name**: If the string does not match any block name and does
+          not contain any dot character, ``'.'``, it will be considered a variable
+          name. In this case, the indicated variable will be extracted from the
+          context and returned after the last block has been produced.
+        * **block_name + variable_name**: If the complete string does not match a
+          block name but it contains at least one dot, ``'.'``, it will be split
+          in two parts on the last dot. If the first part of the string matches a
+          block name exactly, the second part of the string will be considered a
+          variable name, assuming the format ``{block_name}.{variable_name}``, and
+          the indicated variable will be extracted from the context and returned
+          after the block has been produced. Otherwise, if the extracted
+          ``block_name`` does not match a block name exactly, a ``ValueError``
+          will be raised.
 
         Args:
-            output (str or int): Output specification as either a string or an integer.
+            output (str or int):
+                Output specification as either a string or an integer.
+
+        Raises:
+            ValueError:
+                If the output string contains dots but it does not match a block
+                name exactly.
 
         Returns:
             tuple:
@@ -239,15 +255,21 @@ def _get_output_spec(self, output):
                   It can be ``None``, which means that the whole context is to be
                   returned.
         """
+        # If None is given, both block and variable are None
         if output is None:
             return None, None
 
+        # If an int is given, it is a block index and there is no variable
         if isinstance(output, int):
            output = self._get_block_name(output)
+
             return output, None
 
+        # If the string matches a block name, there is no variable
         if output in self.blocks:
             return output, None
 
+        # If there is at least one dot in the output, but it did not match
+        # a block name, it is considered to be {block_name}.{variable_name}
         if '.' 
in output: output_block, output_variable = output.rsplit('.', 1) if output_block not in self.blocks: @@ -255,6 +277,9 @@ def _get_output_spec(self, output): return output_block, output_variable + # If the given string is not a block name and it has no dots, + # it is considered to be a variable name to be extracted + # from the context after the last block has been produced last_block_name = self._get_block_name(-1) return last_block_name, output @@ -285,25 +310,48 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): `produce` calls will be taken. Args: - X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to - behave. - output_ (str or int): Output specification, which can be a string or an integer. - If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - If a string is given, it is expected to be the name of one block, including - its counter number at the end. Optionally, a variable name can be included - at the end after the counter number using a ``'.'`` as a separator between the - block name and the variable name. If the variable name is given, this will be - extracted from the context and returned. Otherwise, the whole context will - be returned. - start_ (str or int): Block index or block name to start processing from. The + X: + Fit Data, which the pipeline will learn from. + + y: + Fit Data labels, which the pipeline will use to learn how to + behave. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + + * If it is None (default), nothing will be returned + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot characted, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. 
+ + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. Returns: None or dict or object: @@ -328,11 +376,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_: - start_ = False - elif start_: - LOGGER.debug("Skipping block %s fit", block_name) - continue + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s fit", block_name) + continue LOGGER.debug("Fitting block %s", block_name) try: @@ -357,7 +406,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if block_name == output_block: return self._get_output(output_variable, context) - def predict(self, X=None, output_='y', start_=None, **kwargs): + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) + + def predict(self, X=None, output_=None, start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -369,23 +422,43 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): will be taken. Args: - X: Data which the pipeline will use to make predictions. - output_ (str or int): Output specification, which can be a string or an integer. - If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - If a string is given, it is expected to be the name of one block, including - its counter number at the end. Optionally, a variable name can be included - at the end after the counter number using a ``'.'`` as a separator between the - block name and the variable name. If the variable name is given, this will be - extracted from the context and returned. Otherwise, the whole context will - be returned. - start_ (str or int): Block index or block name to start processing from. The + X: + Data which the pipeline will use to make predictions. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + * If it is None (default), the output of the last block will be returned. + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot characted, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. 
Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. Returns: None or dict or object: @@ -408,11 +481,12 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_: - start_ = False - elif start_: - LOGGER.debug("Skipping block %s produce", block_name) - continue + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s produce", block_name) + continue LOGGER.debug("Producing block %s", block_name) try: @@ -432,6 +506,9 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) + if output_ is None: + return outputs + def to_dict(self): """Return all the details of this MLPipeline in a dict. @@ -487,7 +564,8 @@ def save(self, path): The content of the JSON file is the dict returned by the `to_dict` method. Args: - path (str): Path to the JSON file to write. + path (str): + Path to the JSON file to write. """ with open(path, 'w') as out_file: json.dump(self.to_dict(), out_file, indent=4) @@ -499,7 +577,8 @@ def from_dict(cls, metadata): The dict structure is the same as the one created by the `to_dict` method. Args: - metadata (dict): Dictionary containing the pipeline specification. + metadata (dict): + Dictionary containing the pipeline specification. Returns: MLPipeline: @@ -531,7 +610,8 @@ def load(cls, path): The JSON file format is the same as the one created by the `to_dict` method. Args: - path (str): Path of the JSON file to load. + path (str): + Path of the JSON file to load. Returns: MLPipeline: diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 9bca6a5d..f2300f67 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -37,6 +37,7 @@ def add_primitives_path(path): Raises: ValueError: A `ValueError` will be raised if the path is not valid. + """ if path not in _PRIMITIVES_PATHS: if not os.path.isdir(path): @@ -68,7 +69,6 @@ def get_primitives_paths(): list: The list of folders. """ - primitives_paths = list() entry_points = pkg_resources.iter_entry_points('mlprimitives') for entry_point in entry_points: @@ -99,7 +99,6 @@ def load_primitive(name): ValueError: A `ValueError` will be raised if the primitive cannot be found. 
""" - for base_path in get_primitives_paths(): parts = name.split('.') number_of_parts = len(parts) diff --git a/setup.cfg b/setup.cfg index 62ced521..17244565 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,3 +45,9 @@ collect_ignore = ['setup.py'] [tool:pylint] good-names = X,y + +[doc8] +max-line-length = 99 + +[pydocstyle] +add-ignore = D403,D413,D105,D107 diff --git a/setup.py b/setup.py index f6991ab1..c73eb0a6 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,10 @@ # Advanced testing 'tox>=2.9.1', 'coverage>=4.5.1', + + # Documentation style + 'doc8==0.8.0', + 'pydocstyle==3.0.0' ] From 711201650e50e7ef0c3861347ac89abfa1a5c77d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:42:10 +0200 Subject: [PATCH 020/160] Add missing dependency --- setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c73eb0a6..f355be93 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,10 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz==0.9', - 'ipython==6.5.0', - 'matplotlib==2.2.3', + 'graphviz>=0.9', + 'ipython>=6.5.0', + 'matplotlib>=2.2.3', + 'autodocsumm>=0.1.10', # style check 'flake8>=3.5.0', @@ -61,8 +62,8 @@ 'coverage>=4.5.1', # Documentation style - 'doc8==0.8.0', - 'pydocstyle==3.0.0' + 'doc8>=0.8.0', + 'pydocstyle>=3.0.0' ] From b26e527117cc45f94ed87c558e528a9a3276ff6f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 16 May 2019 19:50:47 +0200 Subject: [PATCH 021/160] Move default and keyword arguments logic to MLBlock --- mlblocks/mlblock.py | 55 ++++++++++++++++++++++++++++++++++++------ mlblocks/mlpipeline.py | 14 +---------- setup.py | 3 ++- 3 files changed, 50 insertions(+), 22 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index c3878e68..80f5baa2 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -222,6 +222,43 @@ def set_hyperparameters(self, hyperparameters): LOGGER.debug('Creating a new primitive instance for %s', self.name) self.instance = self.primitive(**self._hyperparameters) + def _get_method_kwargs(self, kwargs, method_args): + """Prepare the kwargs for the method. + + The kwargs dict will be altered according to the method_kwargs + specification to make them ready for the primitive method to + accept them. + + Args: + kwargs (dict): + keyword arguments that have been passed to the block method. + method_args (list): + method arguments as specified in the JSON annotation. + + Returns: + dict: + A dictionary containing the argument names and values to pass + to the primitive method. + """ + + method_kwargs = dict() + for arg in method_args: + name = arg['name'] + keyword = arg.get('keyword', name) + + if name in kwargs: + value = kwargs[name] + + elif 'default' in arg: + value = arg['default'] + + else: + raise TypeError("missing expected argument '{}'".format(name)) + + method_kwargs[keyword] = value + + return method_kwargs + def fit(self, **kwargs): """Call the fit method of the primitive. @@ -244,9 +281,10 @@ def fit(self, **kwargs): method is given. """ if self.fit_method is not None: - fit_args = self._fit_params.copy() - fit_args.update(kwargs) - getattr(self.instance, self.fit_method)(**fit_args) + fit_kwargs = self._fit_params.copy() + fit_kwargs.update(kwargs) + fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args) + getattr(self.instance, self.fit_method)(**fit_kwargs) def produce(self, **kwargs): """Call the primitive function, or the predict method of the primitive. 
@@ -262,10 +300,11 @@ def produce(self, **kwargs): The output of the call to the primitive function or primitive produce method. """ - produce_args = self._produce_params.copy() - produce_args.update(kwargs) + produce_kwargs = self._produce_params.copy() + produce_kwargs.update(kwargs) + produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args) if self._class: - return getattr(self.instance, self.produce_method)(**produce_args) + return getattr(self.instance, self.produce_method)(**produce_kwargs) - produce_args.update(self._hyperparameters) - return self.primitive(**produce_args) + produce_kwargs.update(self._hyperparameters) + return self.primitive(**produce_kwargs) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index eddb442e..9a0a109e 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -164,22 +164,10 @@ def _get_block_args(self, block_name, block_args, context): kwargs = dict() for arg in block_args: name = arg['name'] - keyword = arg.get('keyword', name) variable = input_names.get(name, name) if variable in context: - value = context[variable] - - elif 'default' in arg: - value = arg['default'] - - else: - raise TypeError( - "Expected argument '{}.{}' not found in context" - .format(block_name, variable) - ) - - kwargs[keyword] = value + kwargs[name] = context[variable] return kwargs diff --git a/setup.py b/setup.py index f355be93..9fca4dfa 100644 --- a/setup.py +++ b/setup.py @@ -15,13 +15,14 @@ install_requires = [ - 'mlprimitives>=0.1.3', ] tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', + 'mlprimitives>=0.1.3,<0.2', + 'urllib3>=1.20,<1.25' ] From 00f11647ab11456f5e2d6761cd36170796ac5250 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 21 May 2019 12:16:33 -0400 Subject: [PATCH 022/160] Load pipelines by name --- mlblocks/__init__.py | 8 +- mlblocks/discovery.py | 263 ++++++++++++++++++ mlblocks/mlblock.py | 2 +- mlblocks/mlpipeline.py | 70 ++++- mlblocks/primitives.py | 116 -------- tests/features/test_pipeline_loading.py | 106 +++++++ .../{test_primitives.py => test_discovery.py} | 40 +-- tests/test_mlpipeline.py | 6 +- 8 files changed, 460 insertions(+), 151 deletions(-) create mode 100644 mlblocks/discovery.py delete mode 100644 mlblocks/primitives.py create mode 100644 tests/features/test_pipeline_loading.py rename tests/{test_primitives.py => test_discovery.py} (60%) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index cf326495..37199013 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -10,9 +10,11 @@ * Documentation: https://HDI-Project.github.io/MLBlocks """ +from mlblocks.discovery import ( + add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths, + load_pipeline, load_primitive) from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline -from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive __author__ = 'MIT Data To AI Lab' __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' @@ -21,6 +23,6 @@ __version__ = '0.3.1-dev' __all__ = [ - 'MLBlock', 'MLPipeline', 'add_primitives_path', - 'get_primitives_paths', 'load_primitive' + 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', + 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive' ] diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py new file mode 100644 index 00000000..78f12021 --- /dev/null +++ b/mlblocks/discovery.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- + +""" +Primitives and Pipelines 
discovery module. + +This module contains functions to load primitive and pipeline +annotations, as well as to configure how MLBlocks finds the +primitives and pipelines. +""" + +import json +import logging +import os +import sys + +import pkg_resources + +LOGGER = logging.getLogger(__name__) + +_PRIMITIVES_PATHS = [ + os.path.join(os.getcwd(), 'mlprimitives'), + os.path.join(sys.prefix, 'mlprimitives'), + os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy + os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy +] +_PIPELINES_PATHS = [ + os.path.join(os.getcwd(), 'mlpipelines'), +] + + +def _add_lookup_path(path, paths): + """Add a new path to lookup. + + The new path will be inserted in the first place of the list, + so any element found in this new folder will take precedence + over any other element with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + + """ + if path not in paths: + if not os.path.isdir(path): + raise ValueError('Invalid path: {}'.format(path)) + + paths.insert(0, os.path.abspath(path)) + return True + + +def add_primitives_path(path): + """Add a new path to look for primitives. + + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other primitive with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PRIMITIVES_PATHS) + if added: + LOGGER.debug('New primitives path added: %s', path) + + +def add_pipelines_path(path): + """Add a new path to look for pipelines. + + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other pipeline with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PIPELINES_PATHS) + if added: + LOGGER.debug('New pipelines path added: %s', path) + + +def _get_lookup_paths(entry_point): + """Get the list of folders where elements will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the entry_point name. + + An example of such an entry point would be:: + + entry_points = { + 'mlprimitives': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Args: + entry_point: + The name of the `entry_point` to look for. + + Returns: + list: + The list of folders. + """ + lookup_paths = list() + entry_points = pkg_resources.iter_entry_points(entry_point) + for entry_point in entry_points: + if entry_point.name == 'jsons_path': + path = entry_point.load() + lookup_paths.append(path) + + return lookup_paths + + +def get_primitives_paths(): + """Get the list of folders where primitives will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the `mlprimitives` name. 
+ + An example of such an entry point would be:: + + entry_points = { + 'mlprimitives': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. + """ + return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives') + + +def get_pipelines_paths(): + """Get the list of folders where pipelines will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the `mlpipelines` name. + + An example of such an entry point would be:: + + entry_points = { + 'mlpipelines': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. + """ + return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines') + + +def _load(name, paths): + """Locate and load the JSON annotation in any of the given paths. + + All the given paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + """ + for base_path in paths: + parts = name.split('.') + number_of_parts = len(parts) + + for folder_parts in range(number_of_parts): + folder = os.path.join(base_path, *parts[:folder_parts]) + filename = '.'.join(parts[folder_parts:]) + '.json' + json_path = os.path.join(folder, filename) + + if os.path.isfile(json_path): + with open(json_path, 'r') as json_file: + LOGGER.debug('Loading %s from %s', name, json_path) + return json.load(json_file) + + +def load_primitive(name): + """Locate and load the primitive JSON annotation. + + All the primitive paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A `ValueError` will be raised if the primitive cannot be found. + """ + primitive = _load(name, get_primitives_paths()) + if not primitive: + raise ValueError("Unknown primitive: {}".format(name)) + + return primitive + + +def load_pipeline(name): + """Locate and load the pipeline JSON annotation. + + All the pipeline paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A `ValueError` will be raised if the pipeline cannot be found. 
+ """ + pipeline = _load(name, get_pipelines_paths()) + if not pipeline: + raise ValueError("Unknown pipeline: {}".format(name)) + + return pipeline diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 80f5baa2..1ab4a557 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -5,7 +5,7 @@ import importlib import logging -from mlblocks.primitives import load_primitive +from mlblocks.discovery import load_primitive LOGGER = logging.getLogger(__name__) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 9a0a109e..dc12b41f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -6,6 +6,7 @@ import logging from collections import Counter, OrderedDict +from mlblocks.discovery import load_pipeline from mlblocks.mlblock import MLBlock LOGGER = logging.getLogger(__name__) @@ -46,6 +47,12 @@ class MLPipeline(): output_names dictionary, as given when the instance was created. Args: + pipeline (str, list, dict or MLPipeline): + The pipeline argument accepts four different types with different interpretations: + * `str`: the name of the pipeline to search and load. + * `list`: the primitives list. + * `dict`: a complete pipeline specification. + * `MLPipeline`: another pipeline to be cloned. primitives (list): List with the names of the primitives that will compose this pipeline. init_params (dict): @@ -73,10 +80,9 @@ def _get_tunable_hyperparameters(self): return tunable - def __init__(self, primitives, init_params=None, input_names=None, output_names=None): - self.primitives = primitives - self.init_params = init_params or dict() - self.blocks = OrderedDict() + @staticmethod + def _build_blocks(primitives, init_params): + blocks = OrderedDict() block_names_count = Counter() for primitive in primitives: @@ -84,23 +90,67 @@ def __init__(self, primitives, init_params=None, input_names=None, output_names= block_names_count.update([primitive]) block_count = block_names_count[primitive] block_name = '{}#{}'.format(primitive, block_count) - block_params = self.init_params.get(block_name, dict()) + block_params = init_params.get(block_name, dict()) if not block_params: - block_params = self.init_params.get(primitive, dict()) + block_params = init_params.get(primitive, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " "for more than one block %s."), primitive) block = MLBlock(primitive, **block_params) - self.blocks[block_name] = block + blocks[block_name] = block except Exception: LOGGER.exception("Exception caught building MLBlock %s", primitive) raise - self.input_names = input_names or dict() - self.output_names = output_names or dict() - self._tunable_hyperparameters = self._get_tunable_hyperparameters() + return blocks + + @staticmethod + def _get_pipeline_dict(pipeline, primitives): + + if isinstance(pipeline, dict): + return pipeline + + elif isinstance(pipeline, str): + return load_pipeline(pipeline) + + elif isinstance(pipeline, MLPipeline): + return pipeline.to_dict() + + elif isinstance(pipeline, list): + if primitives is not None: + raise ValueError('if `pipeline` is a `list`, `primitives` must be `None`') + + return {'primitives': pipeline} + + elif pipeline is None: + if primitives is None: + raise ValueError('Either `pipeline` or `primitives` must be not `None`.') + + return dict() + + def __init__(self, pipeline=None, primitives=None, init_params=None, + input_names=None, output_names=None): + + pipeline = self._get_pipeline_dict(pipeline, primitives) + + self.primitives = primitives or 
pipeline['primitives'] + self.init_params = init_params or pipeline.get('init_params', dict()) + self.blocks = self._build_blocks(self.primitives, self.init_params) + + self.input_names = input_names or pipeline.get('input_names', dict()) + self.output_names = output_names or pipeline.get('output_names', dict()) + + tunable = pipeline.get('tunable_hyperparameters') + if tunable is not None: + self._tunable_hyperparameters = tunable + else: + self._tunable_hyperparameters = self._get_tunable_hyperparameters() + + hyperparameters = pipeline.get('hyperparameters') + if hyperparameters: + self.set_hyperparameters(hyperparameters) def get_tunable_hyperparameters(self): """Get the tunable hyperparamters of each block. diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py deleted file mode 100644 index f2300f67..00000000 --- a/mlblocks/primitives.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Primitives module. - -This module contains functions to load primitive annotations, -as well as to configure how MLBlocks finds the primitives. -""" - -import json -import logging -import os -import sys - -import pkg_resources - -LOGGER = logging.getLogger(__name__) - -_PRIMITIVES_PATHS = [ - os.path.join(os.getcwd(), 'mlprimitives'), - os.path.join(sys.prefix, 'mlprimitives'), - os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy - os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy -] - - -def add_primitives_path(path): - """Add a new path to look for primitives. - - The new path will be inserted in the first place of the list, - so any primitive found in this new folder will take precedence - over any other primitive with the same name that existed in the - system before. - - Args: - path (str): path to add - - Raises: - ValueError: A `ValueError` will be raised if the path is not valid. - - """ - if path not in _PRIMITIVES_PATHS: - if not os.path.isdir(path): - raise ValueError('Invalid path: {}'.format(path)) - - LOGGER.debug('Adding new primitives path %s', path) - _PRIMITIVES_PATHS.insert(0, os.path.abspath(path)) - - -def get_primitives_paths(): - """Get the list of folders where the primitives will be looked for. - - This list will include the value of any `entry_point` named `jsons_path` published under - the name `mlprimitives`. - - An example of such an entry point would be:: - - entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' - ] - } - - where the module `some_module` contains a variable such as:: - - SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') - - Returns: - list: - The list of folders. - """ - primitives_paths = list() - entry_points = pkg_resources.iter_entry_points('mlprimitives') - for entry_point in entry_points: - if entry_point.name == 'jsons_path': - path = entry_point.load() - primitives_paths.append(path) - - return _PRIMITIVES_PATHS + primitives_paths - - -def load_primitive(name): - """Locate and load the JSON annotation of the given primitive. - - All the paths found in PRIMTIVE_PATHS will be scanned to find a JSON file - with the given name, and as soon as a JSON with the given name is found it - is returned. - - Args: - name (str): name of the primitive to look for. The name should - correspond to the primitive, not to the filename, as the - `.json` extension will be added dynamically. - - Returns: - dict: - The content of the JSON annotation file loaded into a dict. - - Raises: - ValueError: A `ValueError` will be raised if the primitive cannot be - found. 
- """ - for base_path in get_primitives_paths(): - parts = name.split('.') - number_of_parts = len(parts) - - for folder_parts in range(number_of_parts): - folder = os.path.join(base_path, *parts[:folder_parts]) - filename = '.'.join(parts[folder_parts:]) + '.json' - json_path = os.path.join(folder, filename) - - if os.path.isfile(json_path): - with open(json_path, 'r') as json_file: - LOGGER.debug('Loading primitive %s from %s', name, json_path) - return json.load(json_file) - - raise ValueError("Unknown primitive: {}".format(name)) diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py new file mode 100644 index 00000000..bc344d63 --- /dev/null +++ b/tests/features/test_pipeline_loading.py @@ -0,0 +1,106 @@ +from unittest import TestCase +from unittest.mock import Mock + +from mlblocks import MLPipeline + + +class TestMLPipeline(TestCase): + + def test_dict(self): + pipeline_dict = { + 'primitives': [ + 'sklearn.ensemble.RandomForestClassifier' + ], + 'init_params': { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + }, + 'input_names': { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + }, + 'output_names': { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + } + + pipeline = MLPipeline(pipeline_dict) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + assert pipeline.input_names == { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + } + assert pipeline.output_names == { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + + def test_list(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_none(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_mlpipeline(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + pipeline2 = MLPipeline(pipeline) + + assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline2.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } diff --git a/tests/test_primitives.py b/tests/test_discovery.py similarity index 60% rename from tests/test_primitives.py rename to tests/test_discovery.py index 1afd17b6..3a7c3321 100644 --- a/tests/test_primitives.py +++ b/tests/test_discovery.py @@ -9,57 +9,57 @@ import pytest from pkg_resources import Distribution, EntryPoint -from mlblocks import primitives +from mlblocks import discovery FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake' -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', 
new=['a', 'b']) def test_add_primitives_path_do_nothing(): - primitives.add_primitives_path('a') + discovery.add_primitives_path('a') - assert primitives._PRIMITIVES_PATHS == ['a', 'b'] + assert discovery._PRIMITIVES_PATHS == ['a', 'b'] -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path_exception(): invalid_path = str(uuid.uuid4()) with pytest.raises(ValueError): - primitives.add_primitives_path(invalid_path) + discovery.add_primitives_path(invalid_path) -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path(): - primitives.add_primitives_path('tests') + discovery.add_primitives_path('tests') expected_path = os.path.abspath('tests') - assert primitives._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] + assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') def test_get_primitives_paths_no_entry_points(iep_mock): # setup iep_mock.return_value == [] # run - paths = primitives.get_primitives_paths() + paths = discovery.get_primitives_paths() # assert assert paths == ['a', 'b'] iep_mock.assert_called_once_with('mlprimitives') -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') def test_get_primitives_paths_entry_points(iep_mock): # setup something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') jsons_path_ep = EntryPoint( 'jsons_path', - 'tests.test_primitives', + 'tests.test_discovery', attrs=['FAKE_MLPRIMITIVES_PATH'], dist=Distribution() ) @@ -69,7 +69,7 @@ def test_get_primitives_paths_entry_points(iep_mock): ] # run - paths = primitives.get_primitives_paths() + paths = discovery.get_primitives_paths() # assert expected = [ @@ -82,10 +82,10 @@ def test_get_primitives_paths_entry_points(iep_mock): iep_mock.assert_called_once_with('mlprimitives') -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_load_primitive_value_error(): with pytest.raises(ValueError): - primitives.load_primitive('invalid.primitive') + discovery.load_primitive('invalid.primitive') def test_load_primitive_success(): @@ -95,11 +95,11 @@ def test_load_primitive_success(): } with tempfile.TemporaryDirectory() as tempdir: - primitives.add_primitives_path(tempdir) + discovery.add_primitives_path(tempdir) primitive_path = os.path.join(tempdir, 'temp.primitive.json') with open(primitive_path, 'w') as primitive_file: json.dump(primitive, primitive_file, indent=4) - loaded = primitives.load_primitive('temp.primitive') + loaded = discovery.load_primitive('temp.primitive') assert primitive == loaded diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 2fa6d097..741be194 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -39,7 +39,11 @@ def test___init__(self, mlblock_mock, logger_mock): } expected_input_names = input_names.copy() - mlpipeline = MLPipeline(primitives, init_params, input_names) + mlpipeline = MLPipeline( + 
primitives=primitives, + init_params=init_params, + input_names=input_names + ) assert mlpipeline.primitives == expected_primitives assert mlpipeline.init_params == expected_init_params From eb36fcb12f79401c776b0269be35b7c64e1ea22d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 21 May 2019 13:54:02 -0400 Subject: [PATCH 023/160] Fix docs --- docs/advanced_usage/pipelines.rst | 2 +- docs/api/mlblocks.discovery.rst | 5 +++ docs/api/mlblocks.primitives.rst | 5 --- docs/getting_started/quickstart.rst | 2 +- docs/index.rst | 2 +- mlblocks/discovery.py | 34 +++++++-------- mlblocks/mlblock.py | 31 +++++++------- mlblocks/mlpipeline.py | 57 +++++++++++++------------ tests/features/test_pipeline_loading.py | 1 - 9 files changed, 70 insertions(+), 69 deletions(-) create mode 100644 docs/api/mlblocks.discovery.rst delete mode 100644 docs/api/mlblocks.primitives.rst diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst index cc7ccc49..33d57cdc 100644 --- a/docs/advanced_usage/pipelines.rst +++ b/docs/advanced_usage/pipelines.rst @@ -86,7 +86,7 @@ This can be done by passing an extra dictionary to the MLPipeline when it is cre 'n_estimators': 100 } } - pipeline = MLPipeline(primitives, init_params) + pipeline = MLPipeline(primitives, init_params=init_params) This dictionary must have as keys the name of the blocks that the arguments belong to, and as values the dictionary that contains the argument names and their values. diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst new file mode 100644 index 00000000..c9109130 --- /dev/null +++ b/docs/api/mlblocks.discovery.rst @@ -0,0 +1,5 @@ +mlblocks.discovery +================== + +.. automodule:: mlblocks.discovery + :members: diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst deleted file mode 100644 index d625c774..00000000 --- a/docs/api/mlblocks.primitives.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.primitives -=================== - -.. automodule:: mlblocks.primitives - :members: diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2115fcef..c3edf475 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -38,7 +38,7 @@ Optionally, specific `hyperparameters`_ can be also set by specifying them in a 'n_estimators': 100 } } - pipeline = MLPipeline(primitives, hyperparameters) + pipeline = MLPipeline(primitives, init_params=hyperparameters) Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set for each block, by calling the `get_hyperparameters method`_. diff --git a/docs/index.rst b/docs/index.rst index 2bb4c5a9..c3655b3c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -74,7 +74,7 @@ integrate with deep learning libraries. api/mlblocks api/mlblocks.datasets - api/mlblocks.primitives + api/mlblocks.discovery .. toctree:: :caption: Resources diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 78f12021..1f952b81 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -42,7 +42,7 @@ def _add_lookup_path(path, paths): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. """ if path not in paths: @@ -67,7 +67,7 @@ def add_primitives_path(path): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. 
""" added = _add_lookup_path(path, _PRIMITIVES_PATHS) if added: @@ -88,7 +88,7 @@ def add_pipelines_path(path): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. """ added = _add_lookup_path(path, _PIPELINES_PATHS) if added: @@ -98,7 +98,7 @@ def add_pipelines_path(path): def _get_lookup_paths(entry_point): """Get the list of folders where elements will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under + This list will include the value of any ``entry_point`` named ``jsons_path`` published under the entry_point name. An example of such an entry point would be:: @@ -109,13 +109,13 @@ def _get_lookup_paths(entry_point): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') Args: entry_point: - The name of the `entry_point` to look for. + The name of the ``entry_point`` to look for. Returns: list: @@ -134,8 +134,8 @@ def _get_lookup_paths(entry_point): def get_primitives_paths(): """Get the list of folders where primitives will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under - the `mlprimitives` name. + This list will include the value of any ``entry_point`` named ``jsons_path`` published under + the ``mlprimitives`` name. An example of such an entry point would be:: @@ -145,7 +145,7 @@ def get_primitives_paths(): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') @@ -159,8 +159,8 @@ def get_primitives_paths(): def get_pipelines_paths(): """Get the list of folders where pipelines will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under - the `mlpipelines` name. + This list will include the value of any ``entry_point`` named ``jsons_path`` published under + the ``mlpipelines`` name. An example of such an entry point would be:: @@ -170,7 +170,7 @@ def get_pipelines_paths(): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') @@ -190,7 +190,7 @@ def _load(name, paths): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. Returns: dict: @@ -220,7 +220,7 @@ def load_primitive(name): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. Returns: dict: @@ -228,7 +228,7 @@ def load_primitive(name): Raises: ValueError: - A `ValueError` will be raised if the primitive cannot be found. + A ``ValueError`` will be raised if the primitive cannot be found. """ primitive = _load(name, get_primitives_paths()) if not primitive: @@ -246,7 +246,7 @@ def load_pipeline(name): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. 
Returns: dict: @@ -254,7 +254,7 @@ def load_pipeline(name): Raises: ValueError: - A `ValueError` will be raised if the pipeline cannot be found. + A ``ValueError`` will be raised if the pipeline cannot be found. """ pipeline = _load(name, get_pipelines_paths()) if not pipeline: diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 1ab4a557..66bbf8fe 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -30,15 +30,16 @@ class MLBlock(): primitive (object): the actual function or instance which this MLBlock wraps. fit_args (dict): - specification of the arguments expected by the `fit` method. + specification of the arguments expected by the ``fit`` method. fit_method (str): - name of the primitive method to call on `fit`. `None` if the primitive is a function. + name of the primitive method to call on ``fit``. ``None`` if the + primitive is a function. produce_args (dict): - specification of the arguments expected by the `predict` method. + specification of the arguments expected by the ``predict`` method. produce_output (dict): - specification of the outputs of the `produce` method. + specification of the outputs of the ``produce`` method. produce_method (str): - name of the primitive method to call on `produce`. `None` if the primitive is a + name of the primitive method to call on ``produce``. ``None`` if the primitive is a function. Args: @@ -46,19 +47,19 @@ class MLBlock(): Name given to this MLBlock. **kwargs: Any additional arguments that will be used as hyperparameters or passed to the - `fit` or `produce` methods. + ``fit`` or ``produce`` methods. Raises: TypeError: - A `TypeError` is raised if a required argument is not found within the `kwargs` + A ``TypeError`` is raised if a required argument is not found within the ``kwargs`` or if an unexpected argument has been given. """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. - The `init_params`, `fit_params` and `produce_params` are extracted - from the passed `kwargs` taking the metadata hyperparameters as a + The ``init_params``, ``fit_params`` and ``produce_params`` are extracted + from the passed ``kwargs`` taking the metadata hyperparameters as a reference. During this extraction, make sure that all the required hyperparameters @@ -66,15 +67,15 @@ def _extract_params(self, kwargs, hyperparameters): Args: kwargs (dict): - dict containing the Keyword arguments that have been passed to the `__init__` + dict containing the Keyword arguments that have been passed to the ``__init__`` method upon initialization. hyperparameters (dict): hyperparameters dictionary, as found in the JSON annotation. Raises: TypeError: - A `TypeError` is raised if a required argument is not found in the `kwargs` dict, - or if an unexpected argument has been given. + A ``TypeError`` is raised if a required argument is not found in the + ``kwargs`` dict, or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -262,7 +263,7 @@ def _get_method_kwargs(self, kwargs, method_args): def fit(self, **kwargs): """Call the fit method of the primitive. - The given keyword arguments will be passed directly to the `fit` + The given keyword arguments will be passed directly to the ``fit`` method of the primitive instance specified in the JSON annotation. 
If any of the arguments expected by the produce method had been
@@ -277,7 +278,7 @@ def fit(self, **kwargs):
 
         Raises:
             TypeError:
-                A `TypeError` might be raised if any argument not expected by the primitive fit
+                A ``TypeError`` might be raised if any argument not expected by the primitive fit
                 method is given.
         """
         if self.fit_method is not None:
@@ -290,7 +291,7 @@ def produce(self, **kwargs):
         """Call the primitive function, or the predict method of the primitive.
 
         The given keyword arguments will be passed directly to the primitive,
-        if it is a simple function, or to the `produce` method of the
+        if it is a simple function, or to the ``produce`` method of the
         primitive instance specified in the JSON annotation, if it is a class.
 
         If any of the arguments expected by the fit method had been given
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dc12b41f..b73d96b9 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -58,7 +58,7 @@ class MLPipeline():
         init_params (dict):
             dictionary containing initialization arguments to be passed when creating the
             MLBlocks instances. The dictionary keys must be the corresponding primitive names
-            and the values must be another dictionary that will be passed as `**kargs` to the
+            and the values must be another dictionary that will be passed as ``**kwargs`` to the
             MLBlock instance.
         input_names (dict):
             dictionary that maps input variable names with the actual names expected by each
@@ -191,7 +191,7 @@ def _get_block_args(self, block_name, block_args, context):
         """Get the arguments expected by the block method from the context.
 
         The arguments will be taken from the context using both the method
-        arguments specification and the `input_names` given when the pipeline
+        arguments specification and the ``input_names`` given when the pipeline
         was created.
 
         Args:
@@ -245,7 +245,7 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
         return output_dict
 
     def _get_block_name(self, index):
-        """Get the name of the block in the `index` position."""
+        """Get the name of the block in the ``index`` position."""
        return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
@@ -338,14 +338,14 @@ def _get_output(self, output_variable, context):
     def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
         """Fit the blocks of this pipeline.
 
-        Sequentially call the `fit` and the `produce` methods of each block,
-        capturing the outputs each `produce` method before calling the `fit`
+        Sequentially call the ``fit`` and the ``produce`` methods of each block,
+        capturing the outputs of each ``produce`` method before calling the ``fit``
         method of the next one.
 
         During the whole process a context dictionary is built, where both the
-        passed arguments and the captured outputs of the `produce` methods
-        are stored, and from which the arguments for the next `fit` and
-        `produce` calls will be taken.
+        passed arguments and the captured outputs of the ``produce`` methods
+        are stored, and from which the arguments for the next ``fit`` and
+        ``produce`` calls will be taken.
 
         Args:
             X:
@@ -451,12 +451,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
     def predict(self, X=None, output_=None, start_=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
-        Sequentially call the `produce` method of each block, capturing the
+        Sequentially call the ``produce`` method of each block, capturing the
         outputs before calling the next one.
During the whole process a context dictionary is built, where both the
-        passed arguments and the captured outputs of the `produce` methods
-        are stored, and from which the arguments for the next `produce` calls
+        passed arguments and the captured outputs of the ``produce`` methods
+        are stored, and from which the arguments for the next ``produce`` calls
         will be taken.
 
         Args:
@@ -550,7 +550,7 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
     def to_dict(self):
         """Return all the details of this MLPipeline in a dict.
 
-        The dict structure contains all the `__init__` arguments of the
+        The dict structure contains all the ``__init__`` arguments of the
         MLPipeline, as well as the current hyperparameter values and the
         specification of the tunable_hyperparameters::
 
@@ -599,7 +599,7 @@ def to_dict(self):
     def save(self, path):
         """Save the specification of this MLPipeline in a JSON file.
 
-        The content of the JSON file is the dict returned by the `to_dict` method.
+        The content of the JSON file is the dict returned by the ``to_dict`` method.
 
         Args:
             path (str):
@@ -612,7 +612,7 @@ def save(self, path):
     def from_dict(cls, metadata):
         """Create a new MLPipeline from a dict specification.
 
-        The dict structure is the same as the one created by the `to_dict` method.
+        The dict structure is the same as the one created by the ``to_dict`` method.
 
         Args:
             metadata (dict):
@@ -623,29 +623,30 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the given
             specification dictionary.
         """
-        hyperparameters = metadata.get('hyperparameters')
-        tunable = metadata.get('tunable_hyperparameters')
+        # hyperparameters = metadata.get('hyperparameters')
+        # tunable = metadata.get('tunable_hyperparameters')
 
-        pipeline = cls(
-            metadata['primitives'],
-            metadata.get('init_params'),
-            metadata.get('input_names'),
-            metadata.get('output_names'),
-        )
+        # pipeline = cls(
+        #     metadata['primitives'],
+        #     metadata.get('init_params'),
+        #     metadata.get('input_names'),
+        #     metadata.get('output_names'),
+        # )
 
-        if hyperparameters:
-            pipeline.set_hyperparameters(hyperparameters)
+        # if hyperparameters:
+        #     pipeline.set_hyperparameters(hyperparameters)
 
-        if tunable is not None:
-            pipeline._tunable_hyperparameters = tunable
+        # if tunable is not None:
+        #     pipeline._tunable_hyperparameters = tunable
 
-        return pipeline
+        # return pipeline
+        return cls(metadata)
 
     @classmethod
     def load(cls, path):
         """Create a new MLPipeline from a JSON specification.
 
-        The JSON file format is the same as the one created by the `to_dict` method.
+        The JSON file format is the same as the one created by the ``to_dict`` method.
 
         Args:
             path (str):
diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py
index bc344d63..4b363d07 100644
--- a/tests/features/test_pipeline_loading.py
+++ b/tests/features/test_pipeline_loading.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-from unittest.mock import Mock
 
 from mlblocks import MLPipeline
 
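With `from_dict` reduced to a single `cls(metadata)` call, building a pipeline works the same way regardless of the input type that `MLPipeline.__init__` receives. For illustration, a minimal sketch of the four construction modes (the primitive name assumes that a compatible JSON annotation, such as the ones shipped with MLPrimitives, is available; the pipeline name in the last call is hypothetical):

```python
from mlblocks import MLPipeline

# From a list of primitive names.
pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# From a complete specification dict, such as the one returned by to_dict().
from_spec = MLPipeline(pipeline.to_dict())

# From another MLPipeline instance, effectively cloning it.
clone = MLPipeline(pipeline)

# From the name of a pipeline JSON annotation, resolved through load_pipeline.
named = MLPipeline('some.pipeline.name')  # hypothetical annotation name
```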
From d97ad54e547665488fd2dcea21ec8369d95fcb7f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 13:56:51 -0400
Subject: [PATCH 024/160] Update the readme to the latest API changes

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index fb8d3885..cd454b73 100644
--- a/README.md
+++ b/README.md
@@ -81,10 +81,10 @@ them to the `MLPipeline` class.
 >>> pipeline = MLPipeline(primitives)
 ```
 
-Optionally, specific hyperparameters can be also set by specifying them in a dictionary:
+Optionally, specific initialization arguments can also be set by specifying them in a dictionary:
 
 ```python
->>> hyperparameters = {
+>>> init_params = {
 ...     'skimage.feature.hog': {
 ...         'multichannel': True,
 ...         'visualize': False
 ...     },
 ...     'sklearn.ensemble.RandomForestClassifier': {
 ...         'n_estimators': 100,
 ...     }
 ... }
->>> pipeline = MLPipeline(primitives, hyperparameters)
+>>> pipeline = MLPipeline(primitives, init_params=init_params)
 ```
 
 If you want to see which hyperparameters a particular pipeline is using, you can do so by calling

From 221cfb82ac9f6f7cd413043429916a5528567b0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 16:24:00 -0400
Subject: [PATCH 025/160] Remove commented code

---
 mlblocks/mlpipeline.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b73d96b9..ce31780f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -623,23 +623,6 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the given
             specification dictionary.
         """
-        # hyperparameters = metadata.get('hyperparameters')
-        # tunable = metadata.get('tunable_hyperparameters')
-
-        # pipeline = cls(
-        #     metadata['primitives'],
-        #     metadata.get('init_params'),
-        #     metadata.get('input_names'),
-        #     metadata.get('output_names'),
-        # )
-
-        # if hyperparameters:
-        #     pipeline.set_hyperparameters(hyperparameters)
-
-        # if tunable is not None:
-        #     pipeline._tunable_hyperparameters = tunable
-
-        # return pipeline
         return cls(metadata)
 
     @classmethod
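With the commented-out branches removed, `from_dict` delegates everything to the constructor, so `save` and `load` round-trip a pipeline through JSON without any extra logic. A short sketch, assuming a primitives library such as MLPrimitives is installed so the annotation can be found (the file path is arbitrary):

```python
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# to_dict() captures primitives, init_params, input and output names,
# hyperparameters and tunable hyperparameters in a single dict.
pipeline.save('pipeline.json')

loaded = MLPipeline.load('pipeline.json')
assert loaded.to_dict() == pipeline.to_dict()
```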
From 197c47f3cc7bbe6e683a971866bbd9e52b9821d9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 17:05:49 -0400
Subject: [PATCH 026/160] Add instructions to install MLPrimitives

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index cd454b73..01629dc8 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,26 @@ make install
 For development, you can use `make install-develop` instead in order to install all
 the required dependencies for testing and code linting.
 
+## MLPrimitives
+
+In order to be usable, MLBlocks requires a compatible primitives library.
+
+The official library, required in order to follow the MLBlocks tutorial below,
+is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), which you can install
+with this command:
+
+```bash
+pip install mlprimitives
+```
+
 # Usage Example
 
 Below is a short example of how to use MLBlocks to create a simple
 pipeline, fit it using demo data and use it to make predictions.
 
+Please make sure to have installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
+before following it.
+
 For advanced usage and more detailed explanation about each component, please have a look at the [documentation](https://HDI-Project.github.io/MLBlocks)

From d451c7c3d2f9eb4972f8a1c38edbb468410b7d44 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 15:12:57 -0400
Subject: [PATCH 027/160] Address PR feedback

---
 docs/getting_started/quickstart.rst |  7 ++++---
 mlblocks/discovery.py               |  9 +++++++++
 mlblocks/mlpipeline.py              | 11 +++++------
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index c3edf475..2887da05 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -29,16 +29,17 @@ them to the `MLPipeline class`_:
     ]
     pipeline = MLPipeline(primitives)
 
-Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary:
+Optionally, specific `hyperparameters`_ can also be set by specifying them in a dictionary and
+passing them as the ``init_params`` argument:
 
 .. ipython:: python
 
-    hyperparameters = {
+    init_params = {
         'sklearn.ensemble.RandomForestClassifier': {
             'n_estimators': 100
         }
     }
-    pipeline = MLPipeline(primitives, init_params=hyperparameters)
+    pipeline = MLPipeline(primitives, init_params=init_params)
 
 Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been
 set for each block, by calling the `get_hyperparameters method`_.
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 1f952b81..51ff13cd 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -39,11 +39,16 @@ def _add_lookup_path(path, paths):
     Args:
         path (str):
             path to add
+        paths (list):
+            list where the new path will be added.
 
     Raises:
         ValueError:
             A ``ValueError`` will be raised if the path is not valid.
 
+    Returns:
+        bool:
+            Whether the new path was added or not.
     """
     if path not in paths:
         if not os.path.isdir(path):
@@ -52,6 +57,8 @@ def _add_lookup_path(path, paths):
         paths.insert(0, os.path.abspath(path))
         return True
 
+    return False
+
 
 def add_primitives_path(path):
     """Add a new path to look for primitives.
@@ -191,6 +198,8 @@ def _load(name, paths):
     Args:
         name (str):
             name of the JSON to look for. The name should not contain the
            ``.json`` extension, as it will be added dynamically.
+        paths (list):
+            list of paths where the primitives will be looked for.
Returns: dict: diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index ce31780f..b31502ea 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -80,19 +80,18 @@ def _get_tunable_hyperparameters(self): return tunable - @staticmethod - def _build_blocks(primitives, init_params): + def _build_blocks(self): blocks = OrderedDict() block_names_count = Counter() - for primitive in primitives: + for primitive in self.primitives: try: block_names_count.update([primitive]) block_count = block_names_count[primitive] block_name = '{}#{}'.format(primitive, block_count) - block_params = init_params.get(block_name, dict()) + block_params = self.init_params.get(block_name, dict()) if not block_params: - block_params = init_params.get(primitive, dict()) + block_params = self.init_params.get(primitive, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " "for more than one block %s."), primitive) @@ -137,7 +136,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) - self.blocks = self._build_blocks(self.primitives, self.init_params) + self.blocks = self._build_blocks() self.input_names = input_names or pipeline.get('input_names', dict()) self.output_names = output_names or pipeline.get('output_names', dict()) From 8b2b7aaecd72637d9769bfb9ad94025f242e2872 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 16:16:27 -0400 Subject: [PATCH 028/160] rename mlprimitives.jsons_path to mlblocks.primitives and support multiple paths --- mlblocks/discovery.py | 48 +++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 51ff13cd..b5ca840d 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -102,17 +102,17 @@ def add_pipelines_path(path): LOGGER.debug('New pipelines path added: %s', path) -def _get_lookup_paths(entry_point): - """Get the list of folders where elements will be looked for. +def _load_entry_points(entry_point_name, entry_point_group='mlblocks'): + """Get a list of folders from entry points. - This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the entry_point name. + This list will include the value of any entry point named after the given + ``entry_point_name`` published under the given ``entry_point_group``. An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } @@ -129,11 +129,14 @@ def _get_lookup_paths(entry_point): The list of folders. """ lookup_paths = list() - entry_points = pkg_resources.iter_entry_points(entry_point) + entry_points = pkg_resources.iter_entry_points(entry_point_group) for entry_point in entry_points: - if entry_point.name == 'jsons_path': - path = entry_point.load() - lookup_paths.append(path) + if entry_point.name == entry_point_name: + paths = entry_point.load() + if isinstance(paths, str): + lookup_paths.append(paths) + elif isinstance(paths, (list, tuple)): + lookup_paths.extend(paths) return lookup_paths @@ -141,14 +144,18 @@ def _get_lookup_paths(entry_point): def get_primitives_paths(): """Get the list of folders where primitives will be looked for. 
- This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the ``mlprimitives`` name. + This list will include the values of all the entry points named ``primitives`` + published under the entry point group ``mlblocks``. + + Also, for backwards compatibility reasons, the paths from the entry points + named ``jsons_path`` published under the ``mlprimitives`` group will also + be included. An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } @@ -160,20 +167,21 @@ def get_primitives_paths(): list: The list of folders. """ - return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives') + paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives') + return _PRIMITIVES_PATHS + paths def get_pipelines_paths(): """Get the list of folders where pipelines will be looked for. - This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the ``mlpipelines`` name. + This list will include the values of all the entry points named ``pipelines`` + published under the entry point group ``mlblocks``. An example of such an entry point would be:: entry_points = { - 'mlpipelines': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'pipelines=some_module:SOME_VARIABLE' ] } @@ -185,7 +193,7 @@ def get_pipelines_paths(): list: The list of folders. """ - return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines') + return _PIPELINES_PATHS + _load_entry_points('pipelines') def _load(name, paths): From cc012b013b27f8f301eda8adc8edcc2a79a37c57 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 16:16:41 -0400 Subject: [PATCH 029/160] Add unit tests for mlblocks discovery --- tests/test_discovery.py | 151 ++++++++++++++++++++++++++++++++-------- 1 file changed, 122 insertions(+), 29 deletions(-) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 3a7c3321..59bd4404 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -4,7 +4,7 @@ import os import tempfile import uuid -from unittest.mock import patch +from unittest.mock import call, patch import pytest from pkg_resources import Distribution, EntryPoint @@ -14,92 +14,185 @@ FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake' -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_add_primitives_path_do_nothing(): - discovery.add_primitives_path('a') +def test__add_lookup_path_do_nothing(): + paths = ['a', 'b'] + discovery._add_lookup_path('a', paths) - assert discovery._PRIMITIVES_PATHS == ['a', 'b'] + assert paths == ['a', 'b'] -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_add_primitives_path_exception(): +def test__add_lookup_path_exception(): + paths = ['a', 'b'] invalid_path = str(uuid.uuid4()) with pytest.raises(ValueError): - discovery.add_primitives_path(invalid_path) + discovery._add_lookup_path(invalid_path, paths) + + +def test__add_lookup_path(): + paths = ['a', 'b'] + discovery._add_lookup_path('tests', paths) + + expected_path = os.path.abspath('tests') + + assert paths == [expected_path, 'a', 'b'] @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path(): - discovery.add_primitives_path('tests') + discovery.add_primitives_path(os.path.abspath('tests')) expected_path = os.path.abspath('tests') - assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] 
+@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +def test_add_pipelines_path(): + discovery.add_pipelines_path('tests') + + expected_path = os.path.abspath('tests') + assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b'] + + @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) @patch('mlblocks.discovery.pkg_resources.iter_entry_points') -def test_get_primitives_paths_no_entry_points(iep_mock): +def test__load_entry_points_no_entry_points(iep_mock): # setup iep_mock.return_value == [] # run - paths = discovery.get_primitives_paths() + paths = discovery._load_entry_points('jsons_path', 'mlprimitives') # assert - assert paths == ['a', 'b'] - iep_mock.assert_called_once_with('mlprimitives') + assert paths == [] + expected_calls = [ + call('mlprimitives'), + ] + assert iep_mock.call_args_list == expected_calls -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) @patch('mlblocks.discovery.pkg_resources.iter_entry_points') -def test_get_primitives_paths_entry_points(iep_mock): +def test__load_entry_points_entry_points(iep_mock): # setup something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') - jsons_path_ep = EntryPoint( - 'jsons_path', + primitives_ep = EntryPoint( + 'primitives', 'tests.test_discovery', attrs=['FAKE_MLPRIMITIVES_PATH'], dist=Distribution() ) iep_mock.return_value = [ something_else_ep, - jsons_path_ep + primitives_ep ] # run - paths = discovery.get_primitives_paths() + paths = discovery._load_entry_points('primitives') # assert expected = [ - 'a', - 'b', 'this/is/a/fake' ] assert paths == expected - iep_mock.assert_called_once_with('mlprimitives') + expected_calls = [ + call('mlblocks'), + ] + assert iep_mock.call_args_list == expected_calls @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_load_primitive_value_error(): - with pytest.raises(ValueError): - discovery.load_primitive('invalid.primitive') +@patch('mlblocks.discovery._load_entry_points') +def test_get_primitives_paths(lep_mock): + lep_mock.side_effect = [['c'], []] + + paths = discovery.get_primitives_paths() + + assert paths == ['a', 'b', 'c'] + expected_calls = [ + call('primitives'), + call('jsons_path', 'mlprimitives'), + ] + assert lep_mock.call_args_list == expected_calls + + +@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._load_entry_points') +def test_get_pipelines_paths(lep_mock): + lep_mock.return_value = ['c'] + paths = discovery.get_pipelines_paths() -def test_load_primitive_success(): + assert paths == ['a', 'b', 'c'] + lep_mock.assert_called_once_with('pipelines') + + +def test__load_value_error(): + primitive = discovery._load('invalid.primitive', ['a', 'b']) + + assert primitive is None + + +def test__load_success(): primitive = { 'name': 'temp.primitive', 'primitive': 'temp.primitive' } with tempfile.TemporaryDirectory() as tempdir: - discovery.add_primitives_path(tempdir) + paths = [tempdir] primitive_path = os.path.join(tempdir, 'temp.primitive.json') with open(primitive_path, 'w') as primitive_file: json.dump(primitive, primitive_file, indent=4) - loaded = discovery.load_primitive('temp.primitive') + loaded = discovery._load('temp.primitive', paths) assert primitive == loaded + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + 
discovery.load_primitive('invalid.primitive') + + load_mock.assert_called_once_with('invalid.primitive', ['a', 'b']) + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + primitive = discovery.load_primitive('valid.primitive') + + load_mock.assert_called_once_with('valid.primitive', ['a', 'b']) + + assert primitive == load_mock.return_value + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + discovery.load_pipeline('invalid.pipeline') + + load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b']) + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + pipeline = discovery.load_pipeline('valid.pipeline') + + load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) + + assert pipeline == load_mock.return_value From e5de2532b0c83d27c72efc615bd8c680720a5f2d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 19:39:52 -0400 Subject: [PATCH 030/160] Update docs about primitives entry_points --- docs/advanced_usage/adding_primitives.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index e3d4b964..9d358629 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -91,20 +91,27 @@ In order to make **MLBLocks** able to find the primitives defined in such a libr all you need to do is setting up an `Entry Point`_ in your `setup.py` script with the following specification: -1. It has to be published under the name ``mlprimitives``. -2. It has to be named exactly ``jsons_path``. -3. It has to point at a variable that contains the path to the JSONS folder. +1. It has to be published under the group ``mlblocks``. +2. It has to be named exactly ``primitives``. +3. It has to point at a variable that contains a path or a list of paths to the JSONS folder(s). An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } where the module `some_module` contains a variable such as:: - SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + SOME_VARIABLE = 'path/to/primitives' + +or:: + + SOME_VARIABLE = [ + 'path/to/primitives', + 'path/to/more/primitives' + ] .. 
_Entry Point: https://packaging.python.org/specifications/entry-points/
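To make the documented mechanism concrete, this is roughly what the `setup.py` of a third-party primitives library would contain; the package name `my_primitives` and the variable name are illustrative, not taken from any real package:

```python
# setup.py of a hypothetical primitives package
from setuptools import setup

setup(
    name='my-primitives',
    packages=['my_primitives'],
    entry_points={
        'mlblocks': [
            'primitives=my_primitives:MLBLOCKS_PRIMITIVES'
        ]
    },
)
```

with `my_primitives/__init__.py` defining the variable that points at the folder, or list of folders, that holds the JSON annotations:

```python
# my_primitives/__init__.py
import os

MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'jsons')
```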
""" - pipeline = _load(name, get_pipelines_paths()) - if not pipeline: - raise ValueError("Unknown pipeline: {}".format(name)) + pipeline = _PIPELINES.get(name) + if pipeline is None: + pipeline = _load(name, get_pipelines_paths()) + if pipeline is None: + raise ValueError("Unknown pipeline: {}".format(name)) + + _PIPELINES[name] = pipeline return pipeline + + +def _search_annotations(base_path, pattern, parts=None): + annotations = dict() + parts = parts or list() + if os.path.exists(base_path): + for name in os.listdir(base_path): + path = os.path.abspath(os.path.join(base_path, name)) + if os.path.isdir(path): + annotations.update(_search_annotations(path, pattern, parts + [name])) + elif path not in annotations: + name = '.'.join(parts + [name]) + if pattern.search(name) and name.endswith('.json'): + annotations[path] = name[:-5] + + return annotations + + +def _get_annotations_list(paths, loader, pattern, **metadata_filters): + pattern = re.compile(pattern) + annotations = dict() + for base_path in paths: + annotations.update(_search_annotations(base_path, pattern)) + + matching = list() + for name in sorted(annotations.values()): + annotation = loader(name) + metadata = annotation.get('metadata', dict()) + for key, value in metadata_filters.items(): + metadata_value = metadata.get(key, '') + if not re.search(value, metadata_value): + break + + else: + matching.append(name) + + return matching + + +def get_primitives_list(pattern='', **metadata_filters): + return _get_annotations_list( + get_primitives_paths(), load_primitive, pattern, **metadata_filters) + + +def get_pipelines_list(pattern='', **metadata_filters): + return _get_annotations_list( + get_pipelines_paths(), load_pipeline, pattern, **metadata_filters) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 66bbf8fe..6370b4cf 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -27,6 +27,8 @@ class MLBlock(): Attributes: name (str): Name given to this MLBlock. + metadata (dict): + Additional information about this primitive primitive (object): the actual function or instance which this MLBlock wraps. 
fit_args (dict):
@@ -143,22 +145,22 @@ def _get_tunable(cls, hyperparameters, init_params):
 
     def __init__(self, name, **kwargs):
         self.name = name
 
-        metadata = load_primitive(name)
+        primitive = load_primitive(name)
 
-        self.primitive = import_object(metadata['primitive'])
+        self.primitive = import_object(primitive['primitive'])
 
-        self._fit = metadata.get('fit', dict())
+        self._fit = primitive.get('fit', dict())
         self.fit_args = self._fit.get('args', [])
         self.fit_method = self._fit.get('method')
 
-        self._produce = metadata['produce']
+        self._produce = primitive['produce']
         self.produce_args = self._produce['args']
         self.produce_output = self._produce['output']
         self.produce_method = self._produce.get('method')
 
         self._class = bool(self.produce_method)
 
-        hyperparameters = metadata.get('hyperparameters', dict())
+        hyperparameters = primitive.get('hyperparameters', dict())
         init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters)
 
         self._hyperparameters = init_params
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 59bd4404..3681611b 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -11,7 +11,7 @@
 
 from mlblocks import discovery
 
-FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+FAKE_PRIMITIVES_PATH = 'this/is/a/fake'
 
 
 def test__add_lookup_path_do_nothing():
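The exploration helpers introduced in this commit (`get_primitives_list` and `get_pipelines_list`) land without docstrings, so a brief usage sketch may help; the pattern and the `author` metadata key are only examples, and the results depend on which annotations are installed:

```python
from mlblocks import discovery

# Every available primitive whose dotted name matches a regular expression.
forests = discovery.get_primitives_list('RandomForest')

# Keyword arguments are treated as regexes matched against the values of
# the annotation's `metadata` dict; `author` is just an example of a key.
by_author = discovery.get_primitives_list(author='MIT')
```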

From 467948e4088915eabbe2e6853e2d88408a10e96d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 4 Jul 2019 17:57:48 -0400
Subject: [PATCH 032/160] Add support to work with hyperparameters in the format used by BTB

---
 mlblocks/mlpipeline.py | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b31502ea..3c08f444 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,7 +4,9 @@
 
 import json
 import logging
-from collections import Counter, OrderedDict
+from collections import Counter, OrderedDict, defaultdict
+
+import numpy as np
 
 from mlblocks.discovery import load_pipeline
 from mlblocks.mlblock import MLBlock
 
 LOGGER = logging.getLogger(__name__)
@@ -161,18 +163,112 @@ def get_tunable_hyperparameters(self):
         """
         return self._tunable_hyperparameters.copy()
 
-    def get_hyperparameters(self):
+    @classmethod
+    def _sanitize_value(cls, value):
+        """Convert numpy values to their python primitive type equivalent.
+
+        If a value is a dict, recursively sanitize its values.
+
+        Args:
+            value:
+                value to sanitize.
+
+        Returns:
+            sanitized value.
+        """
+        if isinstance(value, dict):
+            return {
+                key: cls._sanitize_value(value)
+                for key, value in value.items()
+            }
+        if isinstance(value, np.integer):
+            return int(value)
+        elif isinstance(value, np.floating):
+            return float(value)
+        elif isinstance(value, np.ndarray):
+            return value.tolist()
+        elif isinstance(value, np.bool_):
+            return bool(value)
+        elif value == 'None':
+            return None
+
+        return value
+
+    @classmethod
+    def _sanitize(cls, hyperparameters):
+        """Convert tuple hyperparameter keys to nested dicts.
+
+        Also convert numpy types to primitive python types.
+
+        The input hyperparameters dict can specify them in two formats:
+
+        One is the native MLBlocks format, where each key is the name of a block and each value
+        is a dict containing a complete hyperparameter specification for that block::
+
+            {
+                "block_name": {
+                    "hyperparameter_name": "hyperparameter_value",
+                    ...
+                },
+                ...
+            }
+
+        The other one is an alternative format where each key is a two-element tuple containing
+        the name of the block as the first element and the name of the hyperparameter as the
+        second one::
+
+            {
+                ("block_name", "hyperparameter_name"): "hyperparameter_value",
+                ...
+            }
+
+
+        Args:
+            hyperparameters (dict):
+                hyperparameters dict to sanitize.
+
+        Returns:
+            dict:
+                Sanitized dict.
+        """
+        params_tree = defaultdict(dict)
+        for key, value in hyperparameters.items():
+            value = cls._sanitize_value(value)
+            if isinstance(key, tuple):
+                block, hyperparameter = key
+                params_tree[block][hyperparameter] = value
+            else:
+                params_tree[key] = value
+
+        return params_tree
+
+    def get_hyperparameters(self, flat=False):
         """Get the current hyperparameters of each block.
 
+        Args:
+            flat (bool): If True, return a flattened dictionary where each key
+                is a two-element tuple containing the name of the block as the first
+                element and the name of the hyperparameter as the second one.
+                If False (default), return a dictionary where each key is the name of
+                a block and each value is a dictionary containing the complete
+                hyperparameter specification of that block.
+
         Returns:
             dict:
                 A dictionary containing the block names as keys and the current block
                 hyperparameters dictionary as values.
         """
-        hyperparameters = {}
+        hyperparameters = dict()
         for block_name, block in self.blocks.items():
             hyperparameters[block_name] = block.get_hyperparameters()
 
+        if flat:
+            hyperparameters = {
+                (block, name): value
+                for block, block_hyperparameters in hyperparameters.items()
+                for name, value in block_hyperparameters.items()
+            }
+
         return hyperparameters
 
     def set_hyperparameters(self, hyperparameters):
@@ -183,6 +279,7 @@ def set_hyperparameters(self, hyperparameters):
                 A dictionary containing the block names as keys and the new hyperparameters
                 dictionary as values.
         """
+        hyperparameters = self._sanitize(hyperparameters)
         for block_name, block_hyperparams in hyperparameters.items():
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
 
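After this change, `set_hyperparameters` accepts both formats described in the `_sanitize` docstring, so proposals coming from a BTB-style tuner can be applied without reshaping them first. A sketch with illustrative block and hyperparameter names, assuming the primitive annotation is available:

```python
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# Native nested format: block name -> hyperparameter dict.
pipeline.set_hyperparameters({
    'sklearn.ensemble.RandomForestClassifier#1': {
        'n_estimators': 100
    }
})

# Flat, tuple-keyed format; _sanitize converts it to the nested
# form, so this call is equivalent to the one above.
pipeline.set_hyperparameters({
    ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 100
})

# get_hyperparameters(flat=True) returns the same tuple-keyed shape.
flat = pipeline.get_hyperparameters(flat=True)
```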
+ Args: + flat (bool): If True, return a flattened dictionary where each key + is a two elements tuple containing the name of the block as the first + element and the name of the hyperparameter as the second one. + If False (default), return a dictionary where each key is the name of + a block and each value is a dictionary containing the complete + hyperparameter specification of that block. + Returns: dict: A dictionary containing the block names as keys and the block tunable hyperparameters dictionary as values. """ - return self._tunable_hyperparameters.copy() + tunables = self._tunable_hyperparameters.copy() + if flat: + tunables = self._flatten_dict(tunables) + + return tunables @classmethod def _sanitize_value(cls, value): @@ -263,11 +283,7 @@ def get_hyperparameters(self, flat=False): hyperparameters[block_name] = block.get_hyperparameters() if flat: - hyperparameters = { - (block, name): value - for block, block_hyperparameters in hyperparameters.items() - for name, value in block_hyperparameters.items() - } + hyperparameters = self._flatten_dict(hyperparameters) return hyperparameters diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 741be194..906c2c61 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -85,9 +85,72 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable + def test_get_tunable_hyperparameters_flat(self): + mlpipeline = MLPipeline(list()) + tunable = { + 'block_1': { + 'hp_1': { + 'type': 'int', + 'range': [ + 1, + 10 + ], + } + }, + 'block_2': { + 'hp_1': { + 'type': 'str', + 'default': 'a', + 'values': [ + 'a', + 'b', + 'c' + ], + }, + 'hp_2': { + 'type': 'bool', + 'default': True, + } + } + } + mlpipeline._tunable_hyperparameters = tunable + + returned = mlpipeline.get_tunable_hyperparameters(flat=True) + + expected = { + ('block_1', 'hp_1'): { + 'type': 'int', + 'range': [ + 1, + 10 + ], + }, + ('block_2', 'hp_1'): { + 'type': 'str', + 'default': 'a', + 'values': [ + 'a', + 'b', + 'c' + ], + }, + ('block_2', 'hp_2'): { + 'type': 'bool', + 'default': True, + } + } + assert returned == expected + def test_get_hyperparameters(self): block_1 = Mock() + block_1.get_hyperparameters.return_value = { + 'a': 'a' + } block_2 = Mock() + block_2.get_hyperparameters.return_value = { + 'b': 'b', + 'c': 'c', + } blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -98,8 +161,40 @@ def test_get_hyperparameters(self): hyperparameters = mlpipeline.get_hyperparameters() assert hyperparameters == { - 'a.primitive.Name#1': block_1.get_hyperparameters.return_value, - 'a.primitive.Name#2': block_2.get_hyperparameters.return_value, + 'a.primitive.Name#1': { + 'a': 'a', + }, + 'a.primitive.Name#2': { + 'b': 'b', + 'c': 'c', + }, + } + block_1.get_hyperparameters.assert_called_once_with() + block_2.get_hyperparameters.assert_called_once_with() + + def test_get_hyperparameters_flat(self): + block_1 = Mock() + block_1.get_hyperparameters.return_value = { + 'a': 'a' + } + block_2 = Mock() + block_2.get_hyperparameters.return_value = { + 'b': 'b', + 'c': 'c', + } + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + mlpipeline = MLPipeline(list()) + mlpipeline.blocks = blocks + + hyperparameters = mlpipeline.get_hyperparameters(flat=True) + + assert hyperparameters == { + ('a.primitive.Name#1', 'a'): 'a', + ('a.primitive.Name#2', 'b'): 'b', + ('a.primitive.Name#2', 'c'): 'c', } 
block_1.get_hyperparameters.assert_called_once_with()
        block_2.get_hyperparameters.assert_called_once_with()

@@ -124,6 +219,24 @@ def test_set_hyperparameters(self):
         block_1.set_hyperparameters.assert_not_called()
         block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
 
+    def test_set_hyperparameters_flat(self):
+        block_1 = Mock()
+        block_2 = Mock()
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+        mlpipeline = MLPipeline(list())
+        mlpipeline.blocks = blocks
+
+        hyperparameters = {
+            ('a.primitive.Name#2', 'some'): 'arg'
+        }
+        mlpipeline.set_hyperparameters(hyperparameters)
+
+        block_1.set_hyperparameters.assert_not_called()
+        block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+
     def test__get_block_args(self):
         pass

From 5bd5a709f853b06b564bd607ee904ea9f95269c9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 5 Jul 2019 20:17:45 -0400
Subject: [PATCH 034/160] Fix setuptools version to fix dependency issues on
 tests

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9fca4dfa..9c7b3d2e 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,8 @@
     'pytest>=3.4.2',
     'pytest-cov>=2.6.0',
     'mlprimitives>=0.1.3,<0.2',
-    'urllib3>=1.20,<1.25'
+    'urllib3>=1.20,<1.25',
+    'setuptools>=41.0.0'
 ]


From 4dcf6022a78ca7230c7c0f714bd7185fdc4dd195 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 13:29:25 -0400
Subject: [PATCH 035/160] Add docs for intermediate outputs

---
 docs/advanced_usage/pipelines.rst | 82 ++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index 33d57cdc..e87a0067 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -271,7 +271,7 @@ Like primitives, Pipelines can also be annotated and stored as dicts or JSON fil
 the different arguments expected by the ``MLPipeline`` class, as well as the set
 hyperparameters and tunable hyperparameters.
 
-Representing a Pipeline as a dict 
+Representing a Pipeline as a dict
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The dict representation of a Pipeline can be obtained directly from an ``MLPipeline`` instance,
@@ -344,6 +344,86 @@ that allows loading the pipeline directly from a JSON file:
 
     pipeline = MLPipeline.load('pipeline.json')
 
+
+Intermediate Outputs and Partial Execution
+------------------------------------------
+
+Sometimes we might be interested in capturing an intermediate output within a
+pipeline execution in order to inspect it for debugging purposes, or to reuse
+it later on to speed up a tuning process where the pipeline needs to be
+executed multiple times over the same data.
+
+For this, two special arguments have been included in the ``fit`` and ``predict``
+methods of an ``MLPipeline``:
+
+output\_
+~~~~~~~~
+
+The ``output_`` argument indicates which block within the pipeline we are interested
+in taking the output values from. This, implicitly, indicates up to which block the
+pipeline needs to be executed within ``fit`` and ``predict`` before returning.
+
+The ``output_`` argument is optional, and it can be ``None``, which is the default,
+an Integer or a String.
+
+It is interpreted as follows:
+
+* If it is ``None`` (default), the ``fit`` method will return nothing and the
+  ``predict`` method will return the output of the last block in the pipeline.
+* If an integer is given, it is interpreted as the block index, starting at 0,
+  and the whole context after executing the specified block will be returned.
+  In case of ``fit``, this means that the outputs will be returned after fitting
+  a block and then producing it on the same data.
+* If it is a string, it can be interpreted in three ways:
+
+  * **block name**: If the string matches a block name exactly, including
+    its hash and counter number ``#n`` at the end, the whole context will be
+    returned after that block is produced.
+  * **variable_name**: If the string does not match any block name and does
+    not contain any dot character, ``'.'``, it will be considered a variable
+    name. In this case, the indicated variable will be extracted from the
+    context and returned after the last block has been produced.
+  * **block_name + variable_name**: If the complete string does not match a
+    block name but it contains at least one dot, ``'.'``, it will be split
+    in two parts on the last dot. If the first part of the string matches a
+    block name exactly, the second part of the string will be considered a
+    variable name, assuming the format ``{block_name}.{variable_name}``, and
+    the indicated variable will be extracted from the context and returned
+    after the block has been produced. Otherwise, if the extracted
+    ``block_name`` does not match a block name exactly, a ``ValueError``
+    will be raised.
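+
+As a minimal sketch of these options, assuming a pipeline whose first block ended up
+named ``a.primitive.Name#1`` (the block and variable names here are illustrative),
+all of these calls are valid::
+
+    # whole context after the first block, selected by index
+    context = pipeline.fit(X_train, y_train, output_=0)
+
+    # whole context after the same block, selected by name
+    context = pipeline.fit(X_train, y_train, output_='a.primitive.Name#1')
+
+    # only the y variable produced by that block
+    y = pipeline.fit(X_train, y_train, output_='a.primitive.Name#1.y')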
+
+start\_
+~~~~~~~
+
+The ``start_`` argument indicates which block within the pipeline we are interested
+in starting the computation from when executing ``fit`` and ``predict``, allowing us
+to skip some of the initial blocks.
+
+The ``start_`` argument is optional, and it can be ``None``, which is the default,
+an Integer or a String.
+
+It is interpreted as follows:
+
+* If it is ``None``, the execution will start on the first block.
+* If it is an integer, it is interpreted as the block index.
+* If it is a string, it is expected to be the name of the block, including the counter
+  number at the end.
+
+This is especially useful when used in combination with the ``output_`` argument, as it
+effectively allows us both to capture intermediate outputs for debugging purposes and to
+reuse intermediate states of the pipeline to accelerate tuning processes.
+
+An example of this situation, where we want to reuse the output of the first block, could be::
+
+    context_0 = pipeline.fit(X_train, y_train, output_=0)
+
+    # Afterwards, within the tuning loop
+    pipeline.fit(start_=1, **context_0)
+    predictions = pipeline.predict(X_test)
+    score = compute_score(y_test, predictions)
+
+
 .. _API Reference: ../api_reference.html
 .. _primitives: ../primitives.html
 .. 
_mlblocks.MLPipeline: ../api_reference.html#mlblocks.MLPipeline From f93c8b155e6c17cc589bac2a6364e0db7443927d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:37:57 -0400 Subject: [PATCH 036/160] Add release notes for v0.3.1 --- HISTORY.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index a312c9cb..e6b14674 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,20 @@ Changelog ========= +0.3.1 - Pipelines Discovery +--------------------------- + +* Support flat hyperparameter dictionaries + [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` + [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala +* Implement partial re-fit + [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock + [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs + [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala + 0.3.0 - New Primitives Discovery -------------------------------- From 0d3ba9245e93a83f6a5d674e4cf84917ec3f898b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:38:04 -0400 Subject: [PATCH 037/160] =?UTF-8?q?Bump=20version:=200.3.1-dev=20=E2=86=92?= =?UTF-8?q?=200.3.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 7 ++++--- setup.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 37199013..b47c8962 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.1-dev' +__version__ = '0.3.1' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 17244565..d4103297 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,15 +1,15 @@ [bumpversion] -current_version = 0.3.1-dev +current_version = 0.3.1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? 
-serialize = +serialize = {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release -values = +values = dev release @@ -51,3 +51,4 @@ max-line-length = 99 [pydocstyle] add-ignore = D403,D413,D105,D107 + diff --git a/setup.py b/setup.py index 9c7b3d2e..3f01d72e 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.1-dev', + version='0.3.1', zip_safe=False, ) From 28a9a44373d10cd0b8e41ead686889535a4b7269 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:38:21 -0400 Subject: [PATCH 038/160] =?UTF-8?q?Bump=20version:=200.3.1=20=E2=86=92=200?= =?UTF-8?q?.3.2-dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index b47c8962..b528aefe 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.1' +__version__ = '0.3.2-dev' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index d4103297..1967b27b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.1 +current_version = 0.3.2-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? diff --git a/setup.py b/setup.py index 3f01d72e..98350606 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.1', + version='0.3.2-dev', zip_safe=False, ) From 677ef256ef5e23c4abfe52b8b5a2f839bf5cdf1d Mon Sep 17 00:00:00 2001 From: Kalyan Veeramachaneni Date: Sun, 14 Jul 2019 19:01:25 -0700 Subject: [PATCH 039/160] Update README.md --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 01629dc8..5b4f2519 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ -

-[header image markup stripped during extraction]
-“MLBlocks”
+[header image markup stripped during extraction]
+“MLBlocksr”
+An open source project from Data to AI Lab at MIT.
+“MLBlocks”
 
 Pipelines and Primitives for Machine Learning and Data Science.
 
[![PyPi][pypi-img]][pypi-url] From 98b4d245c5cefc68f1ce3d1a7217f961dfe3378c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 17 Jul 2019 14:29:07 +0200 Subject: [PATCH 040/160] Isolate block hyperparams from primitives --- mlblocks/mlblock.py | 9 +++++---- tests/test_mlblock.py | 46 +++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 66bbf8fe..fa67bd6b 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -4,6 +4,7 @@ import importlib import logging +from copy import deepcopy from mlblocks.discovery import load_primitive @@ -192,7 +193,7 @@ def get_tunable_hyperparameters(self): tuned, their types and, if applicable, the accepted ranges or values. """ - return self._tunable.copy() + return deepcopy(self._tunable) def get_hyperparameters(self): """Get hyperparameters values that the current MLBlock is using. @@ -202,7 +203,7 @@ def get_hyperparameters(self): the dictionary containing the hyperparameter values that the MLBlock is currently using. """ - return self._hyperparameters.copy() + return deepcopy(self._hyperparameters) def set_hyperparameters(self, hyperparameters): """Set new hyperparameters. @@ -221,7 +222,7 @@ def set_hyperparameters(self, hyperparameters): if self._class: LOGGER.debug('Creating a new primitive instance for %s', self.name) - self.instance = self.primitive(**self._hyperparameters) + self.instance = self.primitive(**self.get_hyperparameters()) def _get_method_kwargs(self, kwargs, method_args): """Prepare the kwargs for the method. @@ -307,5 +308,5 @@ def produce(self, **kwargs): if self._class: return getattr(self.instance, self.produce_method)(**produce_kwargs) - produce_kwargs.update(self._hyperparameters) + produce_kwargs.update(self.get_hyperparameters()) return self.primitive(**produce_kwargs) diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 5273d40c..16f1c6d1 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from unittest import TestCase -from unittest.mock import patch +from unittest.mock import MagicMock, Mock, patch from mlblocks.mlblock import MLBlock, import_object @@ -403,27 +403,53 @@ def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mo assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlblock.import_object', new=Mock()) + @patch('mlblocks.mlblock.load_primitive', new=MagicMock()) + def test_get_hyperparameters(self): + """get_hyperparameters has to return a deepcopy of the _hyperparameters attribute.""" + mlblock = MLBlock('given_primitive_name') + + hyperparameters = { + 'a_list_param': ['a'] + } + mlblock._hyperparameters = hyperparameters + + returned = mlblock.get_hyperparameters() + + assert returned == hyperparameters + assert returned is not hyperparameters + + returned['a_list_param'].append('b') + assert 'b' not in hyperparameters['a_list_param'] + @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') - def test_get_hyperparameters(self, load_primitive_mock, import_object_mock): - """get_hyperparameters has to return a copy of the _hyperparameters attribute.""" - load_primitive_mock.return_value = { - 'primitive': 'a_primitive_name', + def test_modify_hyperparameters(self, lp_mock, io_mock): + """If a primitive method modifies the hyperparameters, changes should not persist.""" + + def primitive(a_list_param): + a_list_param.append('b') + + io_mock.return_value = primitive + + 
lp_mock.return_value = { + 'primitive': 'a_primitive', 'produce': { 'args': [], 'output': [] } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive') - hyperparameters = dict() + hyperparameters = { + 'a_list_param': ['a'] + } mlblock._hyperparameters = hyperparameters - returned = mlblock.get_hyperparameters() + mlblock.produce() - assert returned == hyperparameters - assert returned is not hyperparameters + assert 'b' not in hyperparameters['a_list_param'] def test_set_hyperparameters_function(self): pass From 735f48d02f2d73f019d9623fcfcc0920abfb6904 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 17 Jul 2019 15:07:39 +0200 Subject: [PATCH 041/160] Add fit and produce default arg values --- mlblocks/mlpipeline.py | 4 ++++ tests/test_mlpipeline.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 36b71b29..e19a68ee 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -330,6 +330,10 @@ def _get_block_args(self, block_name, block_args, context): if variable in context: kwargs[name] = context[variable] + elif 'default' in arg: + kwargs[name] = arg['default'] + elif arg.get('required', True): + raise ValueError('Input variable {} not found in context'.format(variable)) return kwargs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 906c2c61..2011f5ae 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -238,7 +238,42 @@ def test_set_hyperparameters_flat(self): block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) def test__get_block_args(self): - pass + input_names = { + 'a_block': { + 'arg_3': 'arg_3_alt' + } + } + pipeline = MLPipeline(list(), input_names=input_names) + + block_args = [ + { + 'name': 'arg_1', + }, + { + 'name': 'arg_2', + 'default': 'arg_2_value' + }, + { + 'name': 'arg_3', + }, + { + 'name': 'arg_4', + 'required': False + }, + ] + context = { + 'arg_1': 'arg_1_value', + 'arg_3_alt': 'arg_3_value' + } + + args = pipeline._get_block_args('a_block', block_args, context) + + expected = { + 'arg_1': 'arg_1_value', + 'arg_2': 'arg_2_value', + 'arg_3': 'arg_3_value', + } + assert args == expected def test__get_outputs(self): pass From 2662fea39476dfc30914a9ded59caecdfe51ad0c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 26 Jul 2019 19:03:12 +0200 Subject: [PATCH 042/160] Fix dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 98350606..0d9f766b 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ 'ipython>=6.5.0', 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', + 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check 'flake8>=3.5.0', From ae6ab0983b10598a214fe9af2eb25e18a7442a5e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 26 Jul 2019 23:22:25 +0200 Subject: [PATCH 043/160] Fix testing dependencies --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0d9f766b..608e481d 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,8 @@ 'pytest-cov>=2.6.0', 'mlprimitives>=0.1.3,<0.2', 'urllib3>=1.20,<1.25', - 'setuptools>=41.0.0' + 'setuptools>=41.0.0', + 'numpy<1.17', ] From cd005af297f72cc5b6cb6b29228de14de992b920 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 14:03:10 +0200 Subject: [PATCH 044/160] Flexible filter searching --- mlblocks/discovery.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 
deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 40853de9..6d85f970 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -312,7 +312,23 @@ def _search_annotations(base_path, pattern, parts=None): return annotations -def _get_annotations_list(paths, loader, pattern, **metadata_filters): +def _match_filter(annotation, key, value): + if '.' in key: + name, key = key.split('.', 1) + part = annotation.get(name) or dict() + return _match_filter(part, key, value) + + annotation_value = annotation.get(key) + if not isinstance(annotation_value, type(value)): + if isinstance(annotation_value, (list, dict)): + return value in annotation_value + elif isinstance(value, (list, dict)): + return annotation_value in value + + return annotation_value == value + + +def _get_annotations_list(paths, loader, pattern, filters): pattern = re.compile(pattern) annotations = dict() for base_path in paths: @@ -321,10 +337,8 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters): matching = list() for name in sorted(annotations.values()): annotation = loader(name) - metadata = annotation.get('metadata', dict()) - for key, value in metadata_filters.items(): - metadata_value = metadata.get(key, '') - if not re.search(value, metadata_value): + for key, value in filters.items(): + if not _match_filter(annotation, key, value): break else: @@ -333,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters): return matching -def get_primitives_list(pattern='', **metadata_filters): - return _get_annotations_list( - get_primitives_paths(), load_primitive, pattern, **metadata_filters) +def get_primitives_list(pattern='', filters=None): + filters = filters or dict() + return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters) -def get_pipelines_list(pattern='', **metadata_filters): - return _get_annotations_list( - get_pipelines_paths(), load_pipeline, pattern, **metadata_filters) +def get_pipelines_list(pattern='', filters=None): + filters = filters or dict() + return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) From 82ef5b53bd5ccd54c8971ae64479bc79d64f35ba Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 21:04:13 +0200 Subject: [PATCH 045/160] Rename find_primitives and add tests --- mlblocks/discovery.py | 10 +- tests/test_discovery.py | 201 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 203 insertions(+), 8 deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 6d85f970..db7ba40d 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -312,11 +312,11 @@ def _search_annotations(base_path, pattern, parts=None): return annotations -def _match_filter(annotation, key, value): +def _match(annotation, key, value): if '.' 
in key: name, key = key.split('.', 1) part = annotation.get(name) or dict() - return _match_filter(part, key, value) + return _match(part, key, value) annotation_value = annotation.get(key) if not isinstance(annotation_value, type(value)): @@ -338,7 +338,7 @@ def _get_annotations_list(paths, loader, pattern, filters): for name in sorted(annotations.values()): annotation = loader(name) for key, value in filters.items(): - if not _match_filter(annotation, key, value): + if not _match(annotation, key, value): break else: @@ -347,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, filters): return matching -def get_primitives_list(pattern='', filters=None): +def find_primitives(pattern='', filters=None): filters = filters or dict() return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters) -def get_pipelines_list(pattern='', filters=None): +def find_pipelines(pattern='', filters=None): filters = filters or dict() return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 3681611b..07fc0753 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -2,9 +2,10 @@ import json import os +import re import tempfile import uuid -from unittest.mock import call, patch +from unittest.mock import Mock, call, patch import pytest from pkg_resources import Distribution, EntryPoint @@ -12,6 +13,10 @@ from mlblocks import discovery FAKE_PRIMITIVES_PATH = 'this/is/a/fake' +FAKE_PRIMITIVES_PATHS = [ + 'this/is/another/fake', + 'this/is/yet/another/fake', +] def test__add_lookup_path_do_nothing(): @@ -81,9 +86,16 @@ def test__load_entry_points_entry_points(iep_mock): attrs=['FAKE_PRIMITIVES_PATH'], dist=Distribution() ) + another_primitives_ep = EntryPoint( + 'primitives', + 'tests.test_discovery', + attrs=['FAKE_PRIMITIVES_PATHS'], + dist=Distribution() + ) iep_mock.return_value = [ something_else_ep, - primitives_ep + primitives_ep, + another_primitives_ep ] # run @@ -91,7 +103,9 @@ def test__load_entry_points_entry_points(iep_mock): # assert expected = [ - 'this/is/a/fake' + 'this/is/a/fake', + 'this/is/another/fake', + 'this/is/yet/another/fake', ] assert paths == expected @@ -196,3 +210,184 @@ def test__load_pipeline_success(load_mock, gpp_mock): load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) assert pipeline == load_mock.return_value + + +@patch('mlblocks.discovery.os') +def test__search_annotations(os_mock): + os_mock.path.abspath = os.path.abspath + os_mock.path.join = os.path.join + os_mock.path.exists.return_value = True + os_mock.listdir.side_effect = [ + [ + 'a.primitive.json', + 'another.primitive.json', + 'some', + ], + [ + 'other', + ], + [ + 'primitive.json' + ] + ] + os_mock.path.isdir.return_value = False + os_mock.path.isdir.side_effect = [ + False, + False, + True, + True, + False + ] + + pattern = re.compile('other') + annotations = discovery._search_annotations('/path/to', pattern) + + assert annotations == { + '/path/to/another.primitive.json': 'another.primitive', + '/path/to/some/other/primitive.json': 'some.other.primitive' + } + + +def test__match_no_match(): + annotation = { + 'name': 'a.primitive', + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_root(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_sublevel(): + annotation = { + 'name': 'a.primitive', + 
'some': { + 'sublevel': { + 'key': 'value' + } + } + } + + matches = discovery._match(annotation, 'some.sublevel.key', 'value') + + assert matches + + +def test__match_list_no_match(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'another_value' + 'yet_another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_list(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'value', + 'another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_dict(): + annotation = { + 'name': 'a.primitive', + 'key': { + 'value': 'subvalue', + 'another_value': 'another_subvalue' + } + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_multiple_keys(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', ['value', 'another_value']) + + assert matches + + +@patch('mlblocks.discovery._search_annotations') +def test__get_annotations_list(search_annotations_mock): + search_annotations_mock.return_value = { + '/path/to/a/classifier.primitive.json': 'classifier.primitive', + '/path/to/a/regressor.primitive.json': 'regressor.primitive', + } + + loader = Mock() + loader.side_effect = [ + { + 'name': 'classifier.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'classifier', + } + }, + { + 'name': 'regressor.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'regressor', + } + } + ] + + filters = { + 'classifiers.subtype': 'regressor' + } + annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters) + + assert annotations == ['regressor.primitive'] + search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern')) + + +@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery.get_primitives_paths') +def test_find_primitives(gpp_mock, gal_mock): + primitives = discovery.find_primitives('pattern') + + gal_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) + + assert primitives == gal_mock.return_value + + +@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery.get_pipelines_paths') +def test_find_primitives(gpp_mock, gal_mock): + primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) + + gal_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) + + assert primitives == gal_mock.return_value From 1ca63500c1c86fc973005ad2d3c2a768f685f13a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 21:08:44 +0200 Subject: [PATCH 046/160] rename method --- tests/test_discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 07fc0753..bf148571 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -384,7 +384,7 @@ def test_find_primitives(gpp_mock, gal_mock): @patch('mlblocks.discovery._get_annotations_list') @patch('mlblocks.discovery.get_pipelines_paths') -def test_find_primitives(gpp_mock, gal_mock): +def test_find_pipelines(gpp_mock, gal_mock): primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) gal_mock.assert_called_once_with( From ec4609f45929defff7e64a09c81a866810774d4f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:10:34 +0200 Subject: [PATCH 047/160] Add docstrings and rename a few methods --- 
mlblocks/discovery.py | 151 ++++++++++++++++++++++++++++++++++++----
 tests/test_discovery.py |  26 ++++---
 2 files changed, 149 insertions(+), 28 deletions(-)

diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index db7ba40d..9a1dbef5 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -297,6 +297,29 @@ def load_pipeline(name):
 
 
 def _search_annotations(base_path, pattern, parts=None):
+    """Search for annotations within the given path.
+
+    If the indicated path has subfolders, search recursively within them.
+
+    If a pattern is given, return only the annotations whose name
+    matches the pattern.
+
+    Args:
+        base_path (str):
+            path to the folder to be searched for annotations.
+        pattern (str):
+            Regular expression to search in the annotation names.
+        parts (list):
+            Optional. List containing the parent folders that are also part
+            of the annotation name. Used during recursion to be able to
+            build the final annotation name before returning it.
+
+    Returns:
+        dict:
+            dictionary containing paths as keys and annotation names as
+            values.
+    """
+    pattern = re.compile(pattern)
     annotations = dict()
     parts = parts or list()
     if os.path.exists(base_path):
@@ -312,24 +335,70 @@ def _search_annotations(base_path, pattern, parts=None):
     return annotations
 
 
-def _match(annotation, key, value):
-    if '.' in key:
-        name, key = key.split('.', 1)
-        part = annotation.get(name) or dict()
-        return _match(part, key, value)
+def _match(annotation, key, values):
+    """Check if the annotation has the key and it matches any of the values.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        annotation (dict):
+            Dictionary annotation.
+        key (str):
+            Key to search within the annotation. It can contain dots to
+            separate nested subdictionary levels within the annotation.
+        values (object or list):
+            Value or list of values to search for.
 
-    annotation_value = annotation.get(key)
-    if not isinstance(annotation_value, type(value)):
+    Returns:
+        bool:
+            whether there is a match or not.
+    """
+    if not isinstance(values, list):
+        values = [values]
+
+    if key not in annotation:
+        if '.' in key:
+            name, key = key.split('.', 1)
+            part = annotation.get(name) or dict()
+            return _match(part, key, values)
+        else:
+            return False
+
+    annotation_value = annotation[key]
+
+    for value in values:
         if isinstance(annotation_value, (list, dict)):
             return value in annotation_value
-        elif isinstance(value, (list, dict)):
-            return annotation_value in value
+        elif annotation_value == value:
+            return True
 
-    return annotation_value == value
+    return False
 
 
-def _get_annotations_list(paths, loader, pattern, filters):
-    pattern = re.compile(pattern)
+def _find_annotations(paths, loader, pattern, filters):
+    """Find matching annotations within the given paths.
+
+    Match annotations by both name pattern and filters.
+
+    Args:
+        paths (list):
+            List of paths to search annotations in.
+        loader (callable):
+            Function to use to load the annotation contents.
+        pattern (str):
+            Pattern to match against the annotation name.
+        filters (dict):
+            Dictionary containing key/value filters.
+
+    Returns:
+        list:
+            names of the matching annotations.
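+
+    A minimal sketch of a call (the path, pattern and filter shown here are
+    illustrative, taken from the tests below)::
+
+        _find_annotations(['/a/path'], load_primitive, 'classifier',
+                          {'classifiers.type': 'estimator'})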
+    """
     annotations = dict()
     for base_path in paths:
         annotations.update(_search_annotations(base_path, pattern))
 
     matching = list()
     for name in sorted(annotations.values()):
         annotation = loader(name)
         for key, value in filters.items():
             if not _match(annotation, key, value):
                 break
 
         else:
             matching.append(name)
 
     return matching
 
 
 def find_primitives(pattern='', filters=None):
+    """Find primitives by name and filters.
+
+    If a pattern is given, only the primitives whose name matches
+    the pattern will be returned.
+
+    If filters are given, they should be a dictionary containing key/value
+    filters that will have to be matched within the primitive annotation
+    for it to be included in the results.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        pattern (str):
+            Regular expression to match against the primitive names.
+        filters (dict):
+            Dictionary containing the filters to apply over the matching
+            primitives.
+
+    Returns:
+        list:
+            Names of the matching primitives.
+    """
     filters = filters or dict()
-    return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters)
+    return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters)
 
 
 def find_pipelines(pattern='', filters=None):
+    """Find pipelines by name and filters.
+
+    If a pattern is given, only the pipelines whose name matches
+    the pattern will be returned.
+
+    If filters are given, they should be a dictionary containing key/value
+    filters that will have to be matched within the pipeline annotation
+    for it to be included in the results.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        pattern (str):
+            Regular expression to match against the pipeline names.
+        filters (dict):
+            Dictionary containing the filters to apply over the matching
+            pipelines.
+
+    Returns:
+        list:
+            Names of the matching pipelines.
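+
+    A quick usage sketch (the pattern and filter values shown here are
+    illustrative)::
+
+        find_pipelines('classifier', filters={'classifiers.type': 'estimator'})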
+ """ filters = filters or dict() - return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) + return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index bf148571..dc3eca87 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -2,7 +2,6 @@ import json import os -import re import tempfile import uuid from unittest.mock import Mock, call, patch @@ -239,8 +238,7 @@ def test__search_annotations(os_mock): False ] - pattern = re.compile('other') - annotations = discovery._search_annotations('/path/to', pattern) + annotations = discovery._search_annotations('/path/to', 'other') assert annotations == { '/path/to/another.primitive.json': 'another.primitive', @@ -338,7 +336,7 @@ def test__match_multiple_keys(): @patch('mlblocks.discovery._search_annotations') -def test__get_annotations_list(search_annotations_mock): +def test__find_annotations(search_annotations_mock): search_annotations_mock.return_value = { '/path/to/a/classifier.primitive.json': 'classifier.primitive', '/path/to/a/regressor.primitive.json': 'regressor.primitive', @@ -365,29 +363,29 @@ def test__get_annotations_list(search_annotations_mock): filters = { 'classifiers.subtype': 'regressor' } - annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters) + annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters) assert annotations == ['regressor.primitive'] - search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern')) + search_annotations_mock.assert_called_once_with('/a/path', 'pattern') -@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery._find_annotations') @patch('mlblocks.discovery.get_primitives_paths') -def test_find_primitives(gpp_mock, gal_mock): +def test_find_primitives(gpp_mock, fa_mock): primitives = discovery.find_primitives('pattern') - gal_mock.assert_called_once_with( + fa_mock.assert_called_once_with( gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) - assert primitives == gal_mock.return_value + assert primitives == fa_mock.return_value -@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery._find_annotations') @patch('mlblocks.discovery.get_pipelines_paths') -def test_find_pipelines(gpp_mock, gal_mock): +def test_find_pipelines(gpp_mock, fa_mock): primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) - gal_mock.assert_called_once_with( + fa_mock.assert_called_once_with( gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) - assert primitives == gal_mock.return_value + assert primitives == fa_mock.return_value From 69a30cafcae4a776e8d1ed09c41116d1c82d2bee Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:36:51 +0200 Subject: [PATCH 048/160] Update README --- README.md | 91 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 5b4f2519..2d49d8a6 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,14 @@ Pipelines and Primitives for Machine Learning and Data Science.

-[![PyPi][pypi-img]][pypi-url] -[![Travis][travis-img]][travis-url] -[![CodeCov][codecov-img]][codecov-url] - -[pypi-img]: https://img.shields.io/pypi/v/mlblocks.svg -[pypi-url]: https://pypi.python.org/pypi/mlblocks -[travis-img]: https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master -[travis-url]: https://travis-ci.org/HDI-Project/MLBlocks -[codecov-img]: https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg -[codecov-url]: https://codecov.io/gh/HDI-Project/MLBlocks +[![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) +[![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks) +[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) +[![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks +- Homepage: https://github.com/HDI-Project/MLBlocks # Overview @@ -44,24 +39,82 @@ Features include: outputs per primitive. * Easy save and load Pipelines using JSON Annotations. -# Installation +# Install + +## Requirements + +**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/) + +Also, although it is not strictly required, the usage of a +[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid +interfering with other software installed in the system where **MLBlocks** is run. + +These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**: + +```bash +pip install virtualenv +virtualenv -p $(which python3.6) mlblocks-venv +``` -The simplest and recommended way to install MLBlocks is using `pip`: +Afterwards, you have to execute this command to have the virtualenv activated: + +```bash +source mlblocks-venv/bin/activate +``` + +Remember about executing it every time you start a new console to work on **MLBlocks**! + +## Install with pip + +After creating the virtualenv and activating it, we recommend using +[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**: ```bash pip install mlblocks ``` -Alternatively, you can also clone the repository and install it from sources +This will pull and install the latest stable release from [PyPi](https://pypi.org/). + +## Install from source + +Alternatively, with your virtualenv activated, you can clone the repository and install it from +source by running `make install` on the `stable` branch: ```bash git clone git@github.com:HDI-Project/MLBlocks.git cd MLBlocks +git checkout stable make install ``` -For development, you can use `make install-develop` instead in order to install all -the required dependencies for testing and code linting. +## Install for Development + +If you want to contribute to the project, a few more steps are required to make the project ready +for development. + +First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks) +and make a fork of the project under you own username by clicking on the **fork** button on the +upper right corner of the page. 
+ +Afterwards, clone your fork and create a branch from master with a descriptive name that includes +the number of the issue that you are going to work on: + +```bash +git clone git@github.com:{your username}/MLBlocks.git +cd MLBlocks +git branch issue-xx-cool-new-feature master +git checkout issue-xx-cool-new-feature +``` + +Finally, install the project with the following command, which will install some additional +dependencies for code linting and testing. + +```bash +make install-develop +``` + +Make sure to use them regularly while developing by running the commands `make lint` and `make test`. + ## MLPrimitives @@ -75,12 +128,12 @@ with this command: pip install mlprimitives ``` -# Usage Example +# Quickstart Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it using demo data and use it to make predictions. -Please make sure to having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) +Please make sure to also having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) before following it. For advance usage and more detailed explanation about each component, please have a look @@ -153,7 +206,7 @@ its `get_hyperparameters` method: } ``` -### Making predictions +## Making predictions Once we have created the pipeline with the desired hyperparameters we can fit it and then use it to make predictions on new data. @@ -180,7 +233,7 @@ to obtain predictions from the pipeline. array([3, 2, 1, ..., 1, 1, 2]) ``` -## What's Next? +# What's Next? If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please From 6324d3ffaad0fc45ad00fabc9c43de0b6e92ebf0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:38:25 +0200 Subject: [PATCH 049/160] Update README title --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 2d49d8a6..19f740ed 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,6 @@ An open source project from Data to AI Lab at MIT.

-[two header image lines removed here; HTML markup stripped during extraction]
 “MLBlocks” [logo image line kept; markup stripped during extraction]
 
@@ -22,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science. * Documentation: https://HDI-Project.github.io/MLBlocks - Homepage: https://github.com/HDI-Project/MLBlocks -# Overview +# MLBlocks MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. From c2771588f0d65e7ad3fdde9c71d3979ecd3dca3a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 13:05:45 +0200 Subject: [PATCH 050/160] Update dependencies --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 608e481d..4c371761 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.1.3,<0.2', + 'mlprimitives>=0.2,<0.3', 'urllib3>=1.20,<1.25', 'setuptools>=41.0.0', 'numpy<1.17', From b65d7c77fd0b4275fb287e77867a8a43471ee3b3 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 13:34:23 +0200 Subject: [PATCH 051/160] Fix docs quickstart --- Makefile | 2 +- docs/getting_started/quickstart.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6266033f..e54e1362 100644 --- a/Makefile +++ b/Makefile @@ -112,7 +112,7 @@ test: ## run tests quickly with the default Python .PHONY: test-all test-all: ## run tests on every Python version with tox - tox + tox -r .PHONY: coverage coverage: ## check code coverage quickly with the default Python diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2887da05..31be89ee 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,6 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] From c189a7f267613e71ba8632c1b5ca80bf1be79043 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 14:20:24 +0200 Subject: [PATCH 052/160] Add metadata attribute --- mlblocks/mlblock.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index c32f978a..5727384e 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -146,22 +146,22 @@ def _get_tunable(cls, hyperparameters, init_params): def __init__(self, name, **kwargs): self.name = name - primitive = load_primitive(name) + self.metadata = load_primitive(name) - self.primitive = import_object(primitive['primitive']) + self.primitive = import_object(self.metadata['primitive']) - self._fit = primitive.get('fit', dict()) + self._fit = self.metadata.get('fit', dict()) self.fit_args = self._fit.get('args', []) self.fit_method = self._fit.get('method') - self._produce = primitive['produce'] + self._produce = self.metadata['produce'] self.produce_args = self._produce['args'] self.produce_output = self._produce['output'] self.produce_method = self._produce.get('method') self._class = bool(self.produce_method) - hyperparameters = primitive.get('hyperparameters', dict()) + hyperparameters = self.metadata.get('hyperparameters', dict()) init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters) self._hyperparameters = init_params From c78c1373f03aa82ef73cbfcffa2d48f051eb4cbf Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 8 Aug 2019 15:42:36 +0200 Subject: 
[PATCH 053/160] Allow passing fit and produce args as init params --- mlblocks/mlblock.py | 25 ++++++++------- mlblocks/mlpipeline.py | 37 +++++++++++----------- tests/features/test_fit_predicr_args.py | 42 +++++++++++++++++++++++++ tests/test_mlblock.py | 25 ++++++++++++--- tests/test_mlpipeline.py | 1 - 5 files changed, 96 insertions(+), 34 deletions(-) create mode 100644 tests/features/test_fit_predicr_args.py diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 5727384e..db24caa5 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -13,8 +13,11 @@ def import_object(object_name): """Import an object from its Fully Qualified Name.""" - package, name = object_name.rsplit('.', 1) - return getattr(importlib.import_module(package), name) + if isinstance(object_name, str): + package, name = object_name.rsplit('.', 1) + return getattr(importlib.import_module(package), name) + + return object_name class MLBlock(): @@ -27,7 +30,7 @@ class MLBlock(): Attributes: name (str): - Name given to this MLBlock. + Primitive name. metadata (dict): Additional information about this primitive primitive (object): @@ -46,8 +49,8 @@ class MLBlock(): function. Args: - name (str): - Name given to this MLBlock. + primitive (str or dict): + primitive name or primitive dictionary. **kwargs: Any additional arguments that will be used as hyperparameters or passed to the ``fit`` or ``produce`` methods. @@ -143,10 +146,12 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable - def __init__(self, name, **kwargs): - self.name = name + def __init__(self, primitive, **kwargs): + if isinstance(primitive, str): + primitive = load_primitive(primitive) - self.metadata = load_primitive(name) + self.metadata = primitive + self.name = primitive['name'] self.primitive = import_object(self.metadata['primitive']) @@ -252,11 +257,9 @@ def _get_method_kwargs(self, kwargs, method_args): if name in kwargs: value = kwargs[name] - elif 'default' in arg: value = arg['default'] - - else: + elif arg.get('required', True): raise TypeError("missing expected argument '{}'".format(name)) method_kwargs[keyword] = value diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index e19a68ee..14e5ce67 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -87,16 +87,21 @@ def _build_blocks(self): block_names_count = Counter() for primitive in self.primitives: + if isinstance(primitive, str): + primitive_name = primitive + else: + primitive_name = primitive['name'] + try: - block_names_count.update([primitive]) - block_count = block_names_count[primitive] - block_name = '{}#{}'.format(primitive, block_count) + block_names_count.update([primitive_name]) + block_count = block_names_count[primitive_name] + block_name = '{}#{}'.format(primitive_name, block_count) block_params = self.init_params.get(block_name, dict()) if not block_params: - block_params = self.init_params.get(primitive, dict()) + block_params = self.init_params.get(primitive_name, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " - "for more than one block %s."), primitive) + "for more than one block %s."), primitive_name) block = MLBlock(primitive, **block_params) blocks[block_name] = block @@ -330,10 +335,6 @@ def _get_block_args(self, block_name, block_args, context): if variable in context: kwargs[name] = context[variable] - elif 'default' in arg: - kwargs[name] = arg['default'] - elif arg.get('required', True): - raise ValueError('Input variable {} not found in 
context'.format(variable)) return kwargs @@ -517,11 +518,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): the value of that variable from the context will extracted and returned after the produce method of that block has been called. """ - context = { - 'X': X, - 'y': y - } - context.update(kwargs) + context = kwargs.copy() + if X is not None: + context['X'] = X + + if y is not None: + context['y'] = y output_block, output_variable = self._get_output_spec(output_) last_block_name = self._get_block_name(-1) @@ -624,10 +626,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): the value of that variable from the context will extracted and returned after the produce method of that block has been called. """ - context = { - 'X': X - } - context.update(kwargs) + context = kwargs.copy() + if X is not None: + context['X'] = X output_block, output_variable = self._get_output_spec(output_) diff --git a/tests/features/test_fit_predicr_args.py b/tests/features/test_fit_predicr_args.py new file mode 100644 index 00000000..af4c0aea --- /dev/null +++ b/tests/features/test_fit_predicr_args.py @@ -0,0 +1,42 @@ +from mlblocks.mlpipeline import MLPipeline + + +def test_fit_predict_args_in_init(): + + def add(a, b): + return a + b + + primitive = { + 'name': 'add', + 'primitive': add, + 'produce': { + 'args': [ + { + 'name': 'a', + 'type': 'float', + }, + { + 'name': 'b', + 'type': 'float', + }, + ], + 'output': [ + { + 'type': 'float', + 'name': 'out' + } + ] + } + } + + primitives = [primitive] + init_params = { + 'add': { + 'b': 10 + } + } + pipeline = MLPipeline(primitives, init_params=init_params) + + out = pipeline.predict(a=3) + + assert out == 13 diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 16f1c6d1..b4dbc637 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -323,6 +323,7 @@ def test__get_tunable_condition_match_null(self): @patch('mlblocks.mlblock.load_primitive') def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [ @@ -335,9 +336,22 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): } } - mlblock = MLBlock('given_primitive_name', argument='value') + mlblock = MLBlock('a_primitive_name', argument='value') - assert mlblock.name == 'given_primitive_name' + assert mlblock.metadata == { + 'name': 'a_primitive_name', + 'primitive': 'a_primitive_name', + 'produce': { + 'args': [ + { + 'name': 'argument' + } + ], + 'output': [ + ] + } + } + assert mlblock.name == 'a_primitive_name' assert mlblock.primitive == import_object_mock.return_value assert mlblock._fit == dict() assert mlblock.fit_args == list() @@ -370,6 +384,7 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): @patch('mlblocks.mlblock.load_primitive') def test___str__(self, load_primitive_mock, import_object_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -377,15 +392,16 @@ def test___str__(self, load_primitive_mock, import_object_mock): } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive_name') - assert str(mlblock) == 'MLBlock - given_primitive_name' + assert str(mlblock) == 'MLBlock - a_primitive_name' @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') def test_get_tunable_hyperparameters(self, 
load_primitive_mock, import_object_mock): """get_tunable_hyperparameters has to return a copy of the _tunables attribute.""" load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -433,6 +449,7 @@ def primitive(a_list_param): io_mock.return_value = primitive lp_mock.return_value = { + 'name': 'a_primitive', 'primitive': 'a_primitive', 'produce': { 'args': [], diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 2011f5ae..327387f5 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -270,7 +270,6 @@ def test__get_block_args(self): expected = { 'arg_1': 'arg_1_value', - 'arg_2': 'arg_2_value', 'arg_3': 'arg_3_value', } assert args == expected From badd7f176e4d5df9b89d0e083224a0c33257c807 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:10:31 +0200 Subject: [PATCH 054/160] Add release notest for v0.3.2 --- HISTORY.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e6b14674..f2654353 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,19 +1,23 @@ Changelog ========= +0.3.2 - 2019-08-12 +------------------ + +* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala +* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala +* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala +* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala +* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala + 0.3.1 - Pipelines Discovery --------------------------- -* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala -* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala -* Implement partial re-fit - [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala -* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala -* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala +* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala +* Implement partial re-fit -[Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala 0.3.0 - New Primitives Discovery -------------------------------- From a094e9f1f7543758a058c8dbf3cb443854cfcf4d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:11:23 +0200 Subject: [PATCH 055/160] =?UTF-8?q?Bump=20version:=200.3.2-dev=20=E2=86=92?= =?UTF-8?q?=200.3.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files 
changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b528aefe..9df5b210 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.2-dev'
+__version__ = '0.3.2'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 1967b27b..97bb08a0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2-dev
+current_version = 0.3.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4c371761..3514f943 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.2-dev',
+    version='0.3.2',
     zip_safe=False,
 )

From 14446f71c60213de2c3206e4beae25c5fa0f5d0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:11:39 +0200
Subject: [PATCH 056/160] Bump version: 0.3.2 → 0.3.3-dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 9df5b210..7f6e1eaf 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.2'
+__version__ = '0.3.3-dev'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 97bb08a0..a9051663 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2
+current_version = 0.3.3-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
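The ``parse`` expression above is what lets bumpversion take a version such as ``0.3.2-dev`` apart into named components before reassembling the next one. A minimal sketch of how the pattern behaves, using only the standard ``re`` module (the pattern is copied from the ``setup.cfg`` hunk above; everything else is illustrative):

    import re

    # Same pattern as in setup.cfg; the release suffix is an optional group.
    PARSE = re.compile(
        r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?'
    )

    assert PARSE.match('0.3.2-dev').groupdict() == {
        'major': '0', 'minor': '3', 'patch': '2', 'release': 'dev'
    }
    # A final release simply leaves the optional group empty:
    assert PARSE.match('0.3.2').groupdict()['release'] is None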
diff --git a/setup.py b/setup.py index 3514f943..870d1276 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.2', + version='0.3.3-dev', zip_safe=False, ) From 65610157d2cea9d42545587b36ef4628d10c2893 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:13:19 +0200 Subject: [PATCH 057/160] Typo in the release notes --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index f2654353..c3b00ce0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,7 +8,7 @@ Changelog * Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala * Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala * Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala -* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala +* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala 0.3.1 - Pipelines Discovery --------------------------- From 8c03242cb648a68f997e3ee0b3b6557623bd3b35 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 3 Sep 2019 14:34:26 +0200 Subject: [PATCH 058/160] Advanced intermediate outputs --- mlblocks/mlpipeline.py | 430 ++++++++++++++----------- tests/features/test_partial_outputs.py | 15 +- tests/test_mlpipeline.py | 43 ++- 3 files changed, 280 insertions(+), 208 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 14e5ce67..b02561fe 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -5,6 +5,7 @@ import json import logging from collections import Counter, OrderedDict, defaultdict +from copy import deepcopy import numpy as np @@ -72,6 +73,11 @@ class MLPipeline(): given when stored in the context dictionary. This allows storing the output of different primitives in different variables, even if the primitive output name is the same one. + outputs (dict): + dictionary containing lists of output variables associated to a name. + verbose (bool): + whether to log the exceptions that occur when running the pipeline before + raising them or not. """ def _get_tunable_hyperparameters(self): @@ -114,7 +120,6 @@ def _build_blocks(self): @staticmethod def _get_pipeline_dict(pipeline, primitives): - if isinstance(pipeline, dict): return pipeline @@ -136,18 +141,50 @@ def _get_pipeline_dict(pipeline, primitives): return dict() + def _get_block_outputs(self, block_name): + """Get the list of output variables for the given block.""" + block = self.blocks[block_name] + outputs = deepcopy(block.produce_output) + for output in outputs: + output['variable'] = '{}.{}'.format(block_name, output['name']) + + return outputs + + def _get_outputs(self, pipeline, outputs): + """Get the output definitions from the pipeline dictionary. + + If the ``"default"`` entry does not exist, it is built using the + outputs from the last block in the pipeline. 
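+
+        For example (illustrative, with an invented block name), a
+        single-block pipeline whose block produces one variable named
+        ``y`` would end up with::
+
+            {'default': [{'name': 'y', 'variable': 'a_block#1.y'}]}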
+ """ + outputs = outputs or pipeline.get('outputs') + if outputs is None: + outputs = dict() + + if 'default' not in outputs: + outputs['default'] = self._get_block_outputs(self._last_block_name) + + return outputs + + def _get_block_name(self, index): + """Get the name of the block in the ``index`` position.""" + return list(self.blocks.keys())[index] + def __init__(self, pipeline=None, primitives=None, init_params=None, - input_names=None, output_names=None): + input_names=None, output_names=None, outputs=None, verbose=True): pipeline = self._get_pipeline_dict(pipeline, primitives) self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) self.blocks = self._build_blocks() + self._last_block_name = self._get_block_name(-1) self.input_names = input_names or pipeline.get('input_names', dict()) self.output_names = output_names or pipeline.get('output_names', dict()) + self.outputs = self._get_outputs(pipeline, outputs) + self.verbose = verbose + tunable = pipeline.get('tunable_hyperparameters') if tunable is not None: self._tunable_hyperparameters = tunable @@ -158,6 +195,122 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, if hyperparameters: self.set_hyperparameters(hyperparameters) + def _get_str_output(self, output): + """Get the outputs that correspond to the str specification.""" + if output in self.outputs: + return self.outputs[output] + elif output in self.blocks: + return self._get_block_outputs(output) + elif '.' in output: + block_name, variable_name = output.rsplit('.', 1) + block = self.blocks.get(block_name) + if not block: + raise ValueError('Invalid block name: {}'.format(block_name)) + + for variable in block.produce_output: + if variable['name'] == variable_name: + return [{'name': variable_name, 'variable': output}] + + raise ValueError('Block {} has no output {}'.format(block_name, variable_name)) + + raise ValueError('Invalid Output Specification: {}'.format(output)) + + def get_outputs(self, outputs='default'): + """Get the list of output variables that correspond to the specified outputs. + + Outputs specification can either be a single string, a single integer, or a + list of strings and integers. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or the name of a block, including the + counter number at the end, or a full variable specification following the format + ``{block-name}.{variable-name}``. + + Alternatively, integers can be passed as indexes of the blocks from which to get + the outputs. + + If output specifications that resolve to multiple output variables are given, + such as the named outputs or block names, all the variables are concatenated + together, in order, in a single variable list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of dictionaries specifying all the output variables. Each + dictionary contains the entries ``name`` and ``variable``, as + well as any other metadata that may have been included in the + pipeline outputs or block produce outputs specification. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. 
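+
+        Example:
+            As an illustration (block names invented), ``'default'`` might
+            resolve to ``[{'name': 'y', 'variable': 'a_block#1.y'}]``,
+            while the integer ``-1`` resolves the same last block by
+            position and the string ``'a_block#1.y'`` selects that single
+            variable directly.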
+ """ + if not isinstance(outputs, (list, tuple)): + outputs = (outputs, ) + + computed = list() + for output in outputs: + if isinstance(output, str): + computed.extend(self._get_str_output(output)) + elif isinstance(output, int): + block_name = self._get_block_name(output) + computed.extend(self._get_block_outputs(block_name)) + else: + raise TypeError('Output Specification can only be str or int') + + return computed + + def get_output_names(self, outputs='default'): + """Get the names of the outputs that correspond to the given specification. + + The indicated outputs will be resolved and the names of the output variables + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of variable names + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['name'] for output in outputs] + + def get_output_variables(self, outputs='default'): + """Get the list of variable specifications of the given outputs. + + The indicated outputs will be resolved and their variables specifications + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of variable specifications. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['variable'] for output in outputs] + @staticmethod def _flatten_dict(hyperparameters): return { @@ -361,96 +514,48 @@ def _extract_outputs(self, block_name, outputs, block_outputs): return output_dict - def _get_block_name(self, index): - """Get the name of the block in the ``index`` position.""" - return list(self.blocks.keys())[index] - - def _get_output_spec(self, output): - """Parse the output specification and get a block name and a variable name. - - The output specification can be of two types: int and str. - - If it is an integer, it is interpreted as a block index, and the variable name - is considered to be ``None``, which means that the whole context will be returned. - - If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. - * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. 
+ def _update_outputs(self, block_name, output_variables, outputs, outputs_dict): + """Set the requested block outputs into the outputs list in the right place.""" + for key, value in outputs_dict.items(): + variable_name = '{}.{}'.format(block_name, key) + if variable_name in output_variables: + index = output_variables.index(variable_name) + outputs[index] = deepcopy(value) + + def _fit_block(self, block, block_name, context): + """Get the block args from the context and fit the block.""" + LOGGER.debug("Fitting block %s", block_name) + try: + fit_args = self._get_block_args(block_name, block.fit_args, context) + block.fit(**fit_args) + except Exception: + if self.verbose: + LOGGER.exception("Exception caught fitting MLBlock %s", block_name) - Args: - output (str or int): - Output specification as either a string or an integer. + raise - Raises: - ValueError: - If the output string contains dots but it does not match a block - name exactly + def _produce_block(self, block, block_name, context, output_variables, outputs): + """Get the block args from the context and produce the block. - Returns: - tuple: - The output is a tuple containing: - * block_name (str): name of the block from which the output will be - returned, including its counter number. - * variable_name (str): Name of the variable to extract from the context. - It can be ``None``, which means that the whole context is to be - returned. + Afterwards, set the block outputs back into the context and update + the outputs list if necessary. """ - # If None is given, both block and varialbe are None - if output is None: - return None, None - - # If an int is given, it is a block index and there is no variable - if isinstance(output, int): - output = self._get_block_name(output) - return output, None - - # If the string matches a block name, there is no variable - if output in self.blocks: - return output, None - - # If there is at least one dot in the output, but it did not match - # a block name, it is considered to be {block_name}.{variable_name} - if '.' in output: - output_block, output_variable = output.rsplit('.', 1) - if output_block not in self.blocks: - raise ValueError('Unknown block name: {}'.format(output_block)) - - return output_block, output_variable - - # If the given string is not a block name and it has no dots, - # it is considered to be a variable name to be extracted - # from the context after the last block has been produced - last_block_name = self._get_block_name(-1) - return last_block_name, output - - def _get_output(self, output_variable, context): - """Get the specified output variable from the context. - - If the variable name is ``None``, return the entire context. - """ - if output_variable: - if output_variable not in context: - raise ValueError('Output variable {} not found in context' - .format(output_variable)) + LOGGER.debug("Producing block %s", block_name) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + block_outputs = block.produce(**produce_args) - return context[output_variable] - else: - return context + outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) + context.update(outputs_dict) + + if output_variables: + self._update_outputs(block_name, output_variables, outputs, outputs_dict) + + except Exception: + if self.verbose: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + + raise def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. 
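A quick aside on the ordering logic that ``_update_outputs`` implements above: requested outputs keep the caller's order, no matter when each block produces its variable. The idea in isolation (block and variable names are made up):

    # Slots are reserved in request order; values drop into their slot
    # whenever the owning block produces them.
    output_variables = ['second_block#1.y', 'first_block#1.X']
    outputs = output_variables.copy()

    def update_outputs(variable_name, value):
        # Mirrors MLPipeline._update_outputs at this point in the series.
        if variable_name in output_variables:
            outputs[output_variables.index(variable_name)] = value

    update_outputs('first_block#1.X', 'X value')   # produced first
    update_outputs('second_block#1.y', 'y value')  # produced later
    assert outputs == ['y value', 'X value']       # request order kept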
@@ -467,35 +572,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): Args: X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to behave. - output_ (str or int or None): - Output specification, which can be a string or an integer or None. - - * If it is None (default), nothing will be returned - * If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - * If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. - * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If ``None`` is given, + nothing will be returned. start_ (str or int or None): Block index or block name to start processing from. The @@ -510,13 +593,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): Returns: None or dict or object: - * If no output is specified, nothing will be returned. - * If an output block has been specified without and output variable, the - context dictionary will be returned after the produce method of that block - has been called. - * If both an output block and an output variable have been specified, - the value of that variable from the context will extracted and returned - after the produce method of that block has been called. + * If no ``output`` is specified, nothing will be returned. + * If ``output_`` has been specified, either a single value or a + tuple of values will be returned. 
""" context = kwargs.copy() if X is not None: @@ -525,8 +604,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - output_block, output_variable = self._get_output_spec(output_) - last_block_name = self._get_block_name(-1) + if output_ is not None: + output_variables = self.get_output_variables(output_) + outputs = output_variables.copy() + output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} + else: + output_variables = None + outputs = None + output_blocks = set() if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -539,34 +624,28 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - LOGGER.debug("Fitting block %s", block_name) - try: - fit_args = self._get_block_args(block_name, block.fit_args, context) - block.fit(**fit_args) - except Exception: - LOGGER.exception("Exception caught fitting MLBlock %s", block_name) - raise + self._fit_block(block, block_name, context) - if (block_name != last_block_name) or (block_name == output_block): - LOGGER.debug("Producing block %s", block_name) - try: - produce_args = self._get_block_args(block_name, block.produce_args, context) - outputs = block.produce(**produce_args) + if (block_name != self._last_block_name) or (block_name in output_blocks): + self._produce_block(block, block_name, context, output_variables, outputs) - output_dict = self._extract_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) - except Exception: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) - raise + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - if block_name == output_block: - return self._get_output(output_variable, context) + # If there was an output_ but there are no pending + # outputs we are done. + if output_ is not None and not output_blocks: + if len(outputs) > 1: + return tuple(outputs) + else: + return outputs[0] if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - def predict(self, X=None, output_=None, start_=None, **kwargs): + def predict(self, X=None, output_='default', start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the ``produce`` method of each block, capturing the @@ -581,29 +660,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): X: Data which the pipeline will use to make predictions. - output_ (str or int or None): - Output specification, which can be a string or an integer or None. - * If it is None (default), the output of the last block will be returned. - * If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - * If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. 
- * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If not specified + the ``default`` output will be returned. start_ (str or int or None): Block index or block name to start processing from. The @@ -617,20 +676,17 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): to the context dictionary and available for the blocks. Returns: - None or dict or object: - * If no output is specified, the output of the last block will be returned. - * If an output block has been specified without and output variable, the - context dictionary will be returned after the produce method of that block - has been called. - * If both an output block and an output variable have been specified, - the value of that variable from the context will extracted and returned - after the produce method of that block has been called. + object or tuple: + * If a single output is requested, it is returned alone. + * If multiple outputs have been requested, a tuple is returned. """ context = kwargs.copy() if X is not None: context['X'] = X - output_block, output_variable = self._get_output_spec(output_) + output_variables = self.get_output_variables(output_) + outputs = output_variables.copy() + output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -643,27 +699,24 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s produce", block_name) continue - LOGGER.debug("Producing block %s", block_name) - try: - produce_args = self._get_block_args(block_name, block.produce_args, context) - outputs = block.produce(**produce_args) - output_dict = self._extract_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + self._produce_block(block, block_name, context, output_variables, outputs) - if block_name == output_block: - return self._get_output(output_variable, context) + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - except Exception: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) - raise + # If there was an output_ but there are no pending + # outputs we are done. + if not output_blocks: + if len(outputs) > 1: + return tuple(outputs) + else: + return outputs[0] if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - if output_ is None: - return outputs - def to_dict(self): """Return all the details of this MLPipeline in a dict. 
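With the docstring changes above, both ``fit`` and ``predict`` accept any specification that ``get_outputs`` understands: a spec that resolves to a single variable is returned alone, and one that resolves to several comes back as a tuple. A hedged usage sketch (it assumes the MLPrimitives annotations for these two primitives are installed; the toy data mirrors the feature tests further down):

    import numpy as np
    from mlblocks import MLPipeline

    X = np.eye(5)
    y = np.array([0, 0, 0, 0, 1])

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ])
    pipeline.fit(X, y)

    # 'default' resolves to one variable, so it is returned alone...
    y_pred = pipeline.predict(X)
    # ...while a list spec that resolves to two variables returns a tuple.
    default, scaled = pipeline.predict(X, output_=['default', 0])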
@@ -710,7 +763,8 @@ def to_dict(self): 'input_names': self.input_names, 'output_names': self.output_names, 'hyperparameters': self.get_hyperparameters(), - 'tunable_hyperparameters': self._tunable_hyperparameters + 'tunable_hyperparameters': self._tunable_hyperparameters, + 'outputs': self.outputs, } def save(self, path): diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index ce28d457..7098dcd7 100644 --- a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -40,7 +40,7 @@ def test_fit_output(self): invalid_int = 10 str_block = 'sklearn.preprocessing.StandardScaler#1' invalid_block = 'InvalidBlockName' - str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X' invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' # Run @@ -58,16 +58,9 @@ def test_fit_output(self): [0.71269665, -0.645124, 0.39067021, 0.31740553], [0.26726124, -0.10752067, 1.36734573, 1.55176035] ]) - y = np.array([1, 0, 0, 1, 2]) - context = { - 'X': X, - 'y': y - } - almost_equal(context, int_out) - almost_equal(context, str_out) - - almost_equal(y, str_out_variable) - + almost_equal(X, int_out) + almost_equal(X, str_out) + almost_equal(X, str_out_variable) assert no_output is None # Run asserting exceptions diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 327387f5..3f6121ea 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -2,7 +2,7 @@ from collections import OrderedDict from unittest import TestCase -from unittest.mock import Mock, call, patch +from unittest.mock import MagicMock, Mock, call, patch from mlblocks.mlpipeline import MLPipeline @@ -12,7 +12,15 @@ class TestMLPipline(TestCase): @patch('mlblocks.mlpipeline.LOGGER') @patch('mlblocks.mlpipeline.MLBlock') def test___init__(self, mlblock_mock, logger_mock): - blocks = [Mock(), Mock(), Mock(), Mock()] + blocks = [Mock(), Mock(), Mock()] + last_block = Mock() + last_block.produce_output = [ + { + 'name': 'y', + 'type': 'array' + } + ] + blocks.append(last_block) mlblock_mock.side_effect = blocks primitives = [ @@ -61,6 +69,16 @@ def test___init__(self, mlblock_mock, logger_mock): 'another.primitive.Name#1': blocks[2].get_tunable_hyperparameters.return_value, 'another.primitive.Name#2': blocks[3].get_tunable_hyperparameters.return_value } + assert mlpipeline.outputs == { + 'default': [ + { + 'name': 'y', + 'type': 'array', + 'variable': 'another.primitive.Name#2.y' + } + ] + } + assert mlpipeline.verbose expected_calls = [ call('a.primitive.Name', an_argument='value'), @@ -75,8 +93,9 @@ def test___init__(self, mlblock_mock, logger_mock): 'a.primitive.Name' ) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters(self): - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) tunable = dict() mlpipeline._tunable_hyperparameters = tunable @@ -85,8 +104,9 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters_flat(self): - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) tunable = { 'block_1': { 'hp_1': { @@ -141,6 +161,7 @@ def test_get_tunable_hyperparameters_flat(self): } assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_hyperparameters(self): block_1 = Mock() 
block_1.get_hyperparameters.return_value = { @@ -155,7 +176,7 @@ def test_get_hyperparameters(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = mlpipeline.get_hyperparameters() @@ -172,6 +193,7 @@ def test_get_hyperparameters(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_hyperparameters_flat(self): block_1 = Mock() block_1.get_hyperparameters.return_value = { @@ -186,7 +208,7 @@ def test_get_hyperparameters_flat(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = mlpipeline.get_hyperparameters(flat=True) @@ -199,6 +221,7 @@ def test_get_hyperparameters_flat(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_set_hyperparameters(self): block_1 = Mock() block_2 = Mock() @@ -206,7 +229,7 @@ def test_set_hyperparameters(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = { @@ -219,6 +242,7 @@ def test_set_hyperparameters(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_set_hyperparameters_flat(self): block_1 = Mock() block_2 = Mock() @@ -226,7 +250,7 @@ def test_set_hyperparameters_flat(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = { @@ -237,13 +261,14 @@ def test_set_hyperparameters_flat(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test__get_block_args(self): input_names = { 'a_block': { 'arg_3': 'arg_3_alt' } } - pipeline = MLPipeline(list(), input_names=input_names) + pipeline = MLPipeline(['a_primitive'], input_names=input_names) block_args = [ { From dabf1a13dfa05f73e70e9b4578b08bca7ace7edc Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 3 Sep 2019 22:16:10 +0200 Subject: [PATCH 059/160] Add unit tests --- mlblocks/mlpipeline.py | 4 +- tests/features/test_partial_outputs.py | 57 ++++++--- tests/test_mlpipeline.py | 164 ++++++++++++++++++++++++- 3 files changed, 201 insertions(+), 24 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index b02561fe..9de286cb 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -209,7 +209,9 @@ def _get_str_output(self, output): for variable in block.produce_output: if variable['name'] == variable_name: - return [{'name': variable_name, 'variable': output}] + output_variable = deepcopy(variable) + output_variable['variable'] = output + return [output_variable] raise ValueError('Block {} has no output {}'.format(block_name, variable_name)) diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index 7098dcd7..d31d2dd8 100644 --- 
a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -3,7 +3,6 @@ import numpy as np -from mlblocks.datasets import load_iris from mlblocks.mlpipeline import MLPipeline @@ -15,6 +14,7 @@ def almost_equal(obj1, obj2): for key, value in obj1.items(): if key not in obj2: raise AssertionError("{} not in {}".format(key, obj2)) + almost_equal(value, obj2[key]) else: @@ -23,9 +23,14 @@ def almost_equal(obj1, obj2): class TestPartialOutputs(TestCase): def setUp(self): - dataset = load_iris() - - self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1) + self.X = np.array([ + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ]) + self.y = np.array([0, 0, 0, 0, 1]) def test_fit_output(self): @@ -36,6 +41,8 @@ def test_fit_output(self): ] pipeline = MLPipeline(primitives) + named = 'default' + list_ = ['default', 0] int_block = 0 invalid_int = 10 str_block = 'sklearn.preprocessing.StandardScaler#1' @@ -44,20 +51,30 @@ def test_fit_output(self): invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' # Run - int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block) - str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block) - str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5], + named_out = pipeline.fit(self.X, self.y, output_=named) + list_out = pipeline.fit(self.X, self.y, output_=list_) + int_out = pipeline.fit(self.X, self.y, output_=int_block) + str_out = pipeline.fit(self.X, self.y, output_=str_block) + str_out_variable = pipeline.fit(self.X, self.y, output_=str_block_variable) - no_output = pipeline.fit(self.X_train, self.y_train) + no_output = pipeline.fit(self.X, self.y) # Assert successful calls X = np.array([ - [0.71269665, -1.45152899, 0.55344946, 0.31740553], - [0.26726124, 1.23648766, -1.1557327, -1.0932857], - [-1.95991577, 0.967686, -1.1557327, -1.0932857], - [0.71269665, -0.645124, 0.39067021, 0.31740553], - [0.26726124, -0.10752067, 1.36734573, 1.55176035] + [2., -0.5, -0.5, -0.5, -0.5], + [-0.5, 2., -0.5, -0.5, -0.5], + [-0.5, -0.5, 2., -0.5, -0.5], + [-0.5, -0.5, -0.5, 2., -0.5], + [-0.5, -0.5, -0.5, -0.5, 2.], ]) + y = np.array([ + 0, 0, 0, 0, 1 + ]) + + almost_equal(named_out, y) + assert len(list_out) == 2 + almost_equal(list_out[0], y) + almost_equal(list_out[1], X) almost_equal(X, int_out) almost_equal(X, str_out) almost_equal(X, str_out_variable) @@ -65,13 +82,13 @@ def test_fit_output(self): # Run asserting exceptions with self.assertRaises(IndexError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int) + pipeline.fit(self.X, self.y, output_=invalid_int) with self.assertRaises(ValueError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block) + pipeline.fit(self.X, self.y, output_=invalid_block) with self.assertRaises(ValueError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable) + pipeline.fit(self.X, self.y, output_=invalid_variable) def test_fit_start(self): # Setup variables @@ -87,8 +104,8 @@ def test_fit_start(self): # Run first block context = { - 'X': self.X_train, - 'y': self.y_train + 'X': self.X, + 'y': self.y } int_start = 1 str_start = 'sklearn.linear_model.LogisticRegression#1' @@ -106,7 +123,7 @@ def test_predict_start(self): 'sklearn.linear_model.LogisticRegression' ] pipeline = MLPipeline(primitives) - pipeline.fit(self.X_train, self.y_train) + pipeline.fit(self.X, self.y) # Mock the first block block_mock = 
Mock() @@ -114,7 +131,7 @@ def test_predict_start(self): # Run first block context = { - 'X': self.X_train, + 'X': self.X, } int_start = 1 str_start = 'sklearn.linear_model.LogisticRegression#1' diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 3f6121ea..7062e38e 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -107,7 +107,7 @@ def test_get_tunable_hyperparameters(self): @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters_flat(self): mlpipeline = MLPipeline(['a_primitive']) - tunable = { + mlpipeline._tunable_hyperparameters = { 'block_1': { 'hp_1': { 'type': 'int', @@ -133,7 +133,6 @@ def test_get_tunable_hyperparameters_flat(self): } } } - mlpipeline._tunable_hyperparameters = tunable returned = mlpipeline.get_tunable_hyperparameters(flat=True) @@ -299,9 +298,168 @@ def test__get_block_args(self): } assert args == expected - def test__get_outputs(self): + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_no_outputs(self): + self_ = Mock() + self_._last_block_name = 'last_block' + self_._get_block_outputs.return_value = ['some', 'outputs'] + + pipeline = dict() + outputs = None + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'] + } + assert returned == expected + + self_._get_block_outputs.assert_called_once_with('last_block') + + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_defaults(self): + self_ = Mock() + + pipeline = dict() + outputs = { + 'default': ['some', 'outputs'] + } + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'] + } + assert returned == expected + self_._get_block_outputs.assert_not_called() + + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_additional(self): + self_ = Mock() + + pipeline = { + 'outputs': { + 'default': ['some', 'outputs'], + 'additional': ['other', 'outputs'] + } + } + outputs = None + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'], + 'additional': ['other', 'outputs'] + } + assert returned == expected + self_._get_block_outputs.assert_not_called() + + def test_get_outputs_str(self): + pass + + def test_get_outputs_int(self): + pass + + def test_get_outputs_list_of_str(self): + pass + + def test_get_outputs_list_of_int(self): pass + def test_get_outputs_named_outputs(self): + pass + + def test_get_outputs_combination(self): + pass + + @patch('mlblocks.mlpipeline.MLBlock') + def test_get_outputs_invalid(self, mlblock_mock): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type', + } + ], + 'debug': [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + } + mlblock_mock.side_effect = [MagicMock(), MagicMock()] + pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + pipeline.blocks['another_primitive#1'].produce_output = [ + { + 'name': 'something', + } + ] + + returned = pipeline.get_outputs(['default', 'debug', -1, 'a_primitive#1.output']) + + expected = [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type' + }, + { + 'name': 'another_name', + 'variable': 'another_variable', + }, + { + 'name': 'something', + 'variable': 'another_primitive#1.something', + }, + { + 'name': 'output', 
+                'type': 'whatever',
+                'variable': 'a_primitive#1.output'
+            }
+        ]
+
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+    def test_get_output_names(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_names()
+
+        assert names == ['a_name']
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+    def test_get_output_variables(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_variables()
+
+        assert names == ['a_variable']
+
     def test_fit(self):
         pass

From 1d74b20a4ded2d95b46067a6f280b989d16312ef Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:20 +0200
Subject: [PATCH 060/160] Release notes for v0.3.3

---
 HISTORY.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index c3b00ce0..f3dc0a32 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
 Changelog
 =========

+0.3.3 - 2019-09-09
+------------------
+
+* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala
+
 0.3.2 - 2019-08-12
 ------------------

From 3b06ab885dce7fc601468dc6ee8f1b901bfba8ff Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:24 +0200
Subject: [PATCH 061/160] Bump version: 0.3.3-dev → 0.3.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 7f6e1eaf..b85b1de0 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.3-dev'
+__version__ = '0.3.3'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9051663..0fa10faa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.3-dev
+current_version = 0.3.3
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
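Stepping back to the two accessors pinned down by the tests a few hunks above: ``get_output_names`` and ``get_output_variables`` are the same projection applied to different keys of the resolved output specifications. The projection in isolation (the outputs dict is the hypothetical one from those tests):

    outputs = {
        'default': [
            {'name': 'a_name', 'variable': 'a_variable', 'type': 'a_type'},
        ]
    }

    names = [spec['name'] for spec in outputs['default']]          # ['a_name']
    variables = [spec['variable'] for spec in outputs['default']]  # ['a_variable']
    assert names == ['a_name'] and variables == ['a_variable']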
diff --git a/setup.py b/setup.py
index 870d1276..4104d912 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.3-dev',
+    version='0.3.3',
     zip_safe=False,
 )

From 0dcb324c1d7e09e0a04d61dd400105afa7d6c8a5 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:43 +0200
Subject: [PATCH 062/160] Bump version: 0.3.3 → 0.3.4-dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b85b1de0..8c30609e 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.3'
+__version__ = '0.3.4-dev'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 0fa10faa..de7507c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.3
+current_version = 0.3.4-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4104d912..421dbbd6 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.3',
+    version='0.3.4-dev',
     zip_safe=False,
 )

From eb78b55e5466b918ed1be8f0a699538b69d26773 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:13:45 +0200
Subject: [PATCH 063/160] support importing class methods

---
 mlblocks/mlblock.py   | 12 ++++++++++--
 tests/test_mlblock.py | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index db24caa5..f570165b 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -13,9 +13,17 @@

 def import_object(object_name):
     """Import an object from its Fully Qualified Name."""
+
     if isinstance(object_name, str):
-        package, name = object_name.rsplit('.', 1)
-        return getattr(importlib.import_module(package), name)
+        parent_name, attribute = object_name.rsplit('.', 1)
+        try:
+            parent = importlib.import_module(parent_name)
+        except ImportError:
+            grand_parent_name, parent_name = parent_name.rsplit('.', 1)
+            grand_parent = importlib.import_module(grand_parent_name)
+            parent = getattr(grand_parent, parent_name)
+
+        return getattr(parent, attribute)

     return object_name

diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index b4dbc637..355015d0 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -3,19 +3,44 @@
 from unittest import TestCase
 from unittest.mock import MagicMock, Mock, patch

-from mlblocks.mlblock import MLBlock, import_object
+import pytest

-# import pytest
+from mlblocks.mlblock import MLBlock, import_object


 class DummyClass:
+    def a_method(self):
+        pass
+
+
+def dummy_function():
     pass


-def test_import_object():
-    dummy_class = import_object(__name__ + '.DummyClass')
+class TestImportObject(TestCase):
+
+    def test_class(self):
+        imported = import_object(__name__ + '.DummyClass')
+
+        assert imported is DummyClass
+
+    def test_class_method(self):
+        imported = import_object(__name__ + '.DummyClass.a_method')
+
+        assert imported is DummyClass.a_method
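+
+    # How the new fallback resolves this case: 'DummyClass.a_method' is
+    # not importable as a module path, so the except ImportError branch
+    # above imports the module instead, then takes 'DummyClass' and
+    # finally 'a_method' off it with getattr, one attribute per step.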
+ + def test_function(self): + imported = import_object(__name__ + '.dummy_function') + + assert imported is dummy_function + + def test_bad_object_name(self): + with pytest.raises(AttributeError): + import_object(__name__ + '.InvalidName') - assert dummy_class is DummyClass + def test_bad_module(self): + with pytest.raises(ModuleNotFoundError): + import_object('an.invalid.module') class TestMLBlock(TestCase): From 8d53d2a94fcc4323dcc0faf37bc38535b8198fa7 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Sep 2019 16:17:25 +0200 Subject: [PATCH 064/160] Add configuration to upload release candidates to PyPI --- Makefile | 26 +++++++++++++++++++------- mlblocks/__init__.py | 2 +- setup.cfg | 15 +++++++++++---- setup.py | 2 +- tox.ini | 2 +- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index e54e1362..6e8dd203 100644 --- a/Makefile +++ b/Makefile @@ -155,7 +155,7 @@ publish: dist ## package and upload a release .PHONY: bumpversion-release bumpversion-release: ## Merge master to stable and bumpversion release - git checkout stable + git checkout stable || git checkout -b stable git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" bumpversion release git push --tags origin stable @@ -167,6 +167,10 @@ bumpversion-patch: ## Merge stable to master and bumpversion patch bumpversion --no-tag patch git push +.PHONY: bumpversion-candidate +bumpversion-candidate: ## Bump the version to the next candidate + bumpversion candidate --no-tag + .PHONY: bumpversion-minor bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion --no-tag minor @@ -175,23 +179,31 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major -CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l) +CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) -.PHONY: check-release -check-release: ## Check if the release can be made +.PHONY: check-master +check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) $(error Please make the release from master branch\n) endif + +.PHONY: check-history +check-history: ## Check if HISTORY.md has been modified ifeq ($(CHANGELOG_LINES),0) $(error Please insert the release notes in HISTORY.md before releasing) -else - @echo "A new release can be made" endif +.PHONY: check-release +check-release: check-master check-history ## Check if the release can be made + @echo "A new release can be made" + .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-candidate +release-candidate: check-master publish bumpversion-candidate + .PHONY: release-minor release-minor: check-release bumpversion-minor release diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 8c30609e..3ede651e 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4-dev' +__version__ = '0.3.4.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index de7507c0..563c9c5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,18 +1,21 @@ [bumpversion] 
-current_version = 0.3.4-dev
+current_version = 0.3.4.dev0
 commit = True
 tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
 serialize = 
-	{major}.{minor}.{patch}-{release}
+	{major}.{minor}.{patch}-{release}{candidate}
 	{major}.{minor}.{patch}

 [bumpversion:part:release]
 optional_value = release
+first_value = dev
 values = 
 	dev
 	release

+[bumpversion:part:candidate]
+
 [bumpversion:file:setup.py]
 search = version='{current_version}'
 replace = version='{new_version}'
@@ -34,8 +37,12 @@
 include_trailing_comment = True
 line_length = 99
 lines_between_types = 0
 multi_line_output = 4
-not_skip = __init__.py
 use_parentheses = True
+not_skip = __init__.py
+skip_glob = *.bak
+
+[metadata]
+description-file = README.md

 [aliases]
 test = pytest
diff --git a/setup.py b/setup.py
index 421dbbd6..abc43800 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.4-dev',
+    version='0.3.4.dev0',
     zip_safe=False,
 )
diff --git a/tox.ini b/tox.ini
index 76529366..666eeab0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,7 +14,7 @@
 setenv =
     PYTHONPATH = {toxinidir}
 extras = test
 commands =
-    /usr/bin/env python -m pytest --cov=mlblocks
+    /usr/bin/env make test


 [testenv:lint]

From 001561a169229ec652c08e4ace32dc3023d4bbd4 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:20:05 +0200
Subject: [PATCH 065/160] Add release-candidate documentation

---
 CONTRIBUTING.rst | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4fce53bf..4c01093e 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -195,3 +195,33 @@ Once this is done, run of the following commands:

 3. If you are releasing a major version::

     make release-major
+
+Release Candidates
+~~~~~~~~~~~~~~~~~~
+
+Sometimes it is necessary or convenient to upload a release candidate to PyPI as a pre-release,
+in order to make some of the new features available for testing on other projects before they
+are included in an actual full-blown release.
+
+In order to perform such an action, you can execute::
+
+    make release-candidate
+
+This will perform the following actions:
+
+1. Build and upload the current version to PyPI as a pre-release, with the format ``X.Y.Z.devN``
+
+2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
+
+After this is done, the new pre-release can be installed by including the ``dev`` section in the
+dependency specification, either in ``setup.py``::
+
+    install_requires = [
+        ...
+        'mlblocks>=X.Y.Z.dev',
+        ...
+    ]
+
+or in command line::
+
+    pip install 'mlblocks>=X.Y.Z.dev'

From 45f9ae2ae6b50a4a6ae1e50f326c130f5a571d69 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:25:45 +0200
Subject: [PATCH 066/160] Fix error in Python 3.5 due to a nonexistent
 exception type

---
 tests/test_mlblock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 355015d0..93adb0dd 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -39,7 +39,7 @@ def test_bad_object_name(self):
             import_object(__name__ + '.InvalidName')

     def test_bad_module(self):
-        with pytest.raises(ModuleNotFoundError):
+        with pytest.raises(ImportError):
             import_object('an.invalid.module')

From 09aa6e9466956d3883895b573d5ea03ad257b501 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:20:52 +0200
Subject: [PATCH 067/160] Fix release-candidate version format

---
 setup.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 563c9c5c..a122a298 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,9 +2,9 @@
 current_version = 0.3.4.dev0
 commit = True
 tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
 serialize = 
-	{major}.{minor}.{patch}-{release}{candidate}
+	{major}.{minor}.{patch}.{release}{candidate}
 	{major}.{minor}.{patch}

 [bumpversion:part:release]

From 2b5d7900a22ce72b7b59ef85a0f024ffec0a0c0d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:21:28 +0200
Subject: [PATCH 068/160] Bump version: 0.3.4.dev0 → 0.3.4.dev1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3ede651e..81b45593 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.4.dev0'
+__version__ = '0.3.4.dev1'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a122a298..0c2ea21a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.4.dev0
+current_version = 0.3.4.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
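The dotted ``devN`` spelling that the previous patch switched to is the normalized PEP 440 form for development pre-releases, which is what pip and PyPI work with. A quick check, assuming the ``packaging`` library is available (an assumption; it is not declared anywhere in this series):

    from packaging.version import Version

    version = Version('0.3.4.dev1')
    assert version.is_prerelease and version.dev == 1

    # Dev releases sort below the final release they precede, which is
    # why a spec like "mlblocks>=X.Y.Z.dev" can match candidates:
    assert Version('0.3.4.dev1') < Version('0.3.4')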
diff --git a/setup.py b/setup.py index abc43800..da4bb6f3 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev0', + version='0.3.4.dev1', zip_safe=False, ) From d790938e8cee8528fe90725eb145fde3c6bd99e2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 30 Oct 2019 13:19:30 -0400 Subject: [PATCH 069/160] New partial output with context - WIP --- mlblocks/mlpipeline.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 9de286cb..d0d67f8f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -145,8 +145,11 @@ def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" block = self.blocks[block_name] outputs = deepcopy(block.produce_output) + output_names = self.output_names.get(block_name, dict()) for output in outputs: - output['variable'] = '{}.{}'.format(block_name, output['name']) + name = output['name'] + context_name = output_names.get(name, name) + output['variable'] = '{}.{}'.format(block_name, context_name) return outputs @@ -606,7 +609,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - if output_ is not None: + if isinstance(output_, str): output_variables = self.get_output_variables(output_) outputs = output_variables.copy() output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} @@ -615,6 +618,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): outputs = None output_blocks = set() + if isinstance(output_, int): + output_ = self._get_block_name(output_) + if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -628,16 +634,19 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): self._fit_block(block, block_name, context) - if (block_name != self._last_block_name) or (block_name in output_blocks): + last_block = block_name != self._last_block_name + if last_block or (block_name == output_) or (block_name in output_blocks): self._produce_block(block, block_name, context, output_variables, outputs) # We already captured the output from this block if block_name in output_blocks: output_blocks.remove(block_name) + elif block_name == output_: + return context # If there was an output_ but there are no pending # outputs we are done. 
- if output_ is not None and not output_blocks: + if output_variables is not None and not output_blocks: if len(outputs) > 1: return tuple(outputs) else: From 1a0eb099d177753d07797757bf0b3ae9a20de1f2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 31 Oct 2019 12:59:46 -0400 Subject: [PATCH 070/160] Allow getting full context in partial outputs --- mlblocks/mlpipeline.py | 61 +++++---- tests/features/test_partial_outputs.py | 7 +- tests/test_mlpipeline.py | 175 +++++++++++++++++++------ 3 files changed, 171 insertions(+), 72 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index d0d67f8f..21aa7ecc 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -4,6 +4,7 @@ import json import logging +import re from collections import Counter, OrderedDict, defaultdict from copy import deepcopy @@ -198,12 +199,15 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, if hyperparameters: self.set_hyperparameters(hyperparameters) + self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') + def _get_str_output(self, output): """Get the outputs that correspond to the str specification.""" if output in self.outputs: return self.outputs[output] elif output in self.blocks: - return self._get_block_outputs(output) + return [{'name': output, 'variable': output}] + # return self._get_block_outputs(output) elif '.' in output: block_name, variable_name = output.rsplit('.', 1) block = self.blocks.get(block_name) @@ -260,11 +264,11 @@ def get_outputs(self, outputs='default'): computed = list() for output in outputs: + if isinstance(output, int): + output = self._get_block_name(output) + if isinstance(output, str): computed.extend(self._get_str_output(output)) - elif isinstance(output, int): - block_name = self._get_block_name(output) - computed.extend(self._get_block_outputs(block_name)) else: raise TypeError('Output Specification can only be str or int') @@ -316,6 +320,18 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] + def _extract_block_name(self, variable_name): + return self._re_block_name.search(variable_name).group(1) + + def _prepare_outputs(self, outputs): + output_variables = self.get_output_variables(outputs) + outputs = output_variables.copy() + output_blocks = { + self._extract_block_name(variable) + for variable in output_variables + } + return output_variables, outputs, output_blocks + @staticmethod def _flatten_dict(hyperparameters): return { @@ -519,13 +535,11 @@ def _extract_outputs(self, block_name, outputs, block_outputs): return output_dict - def _update_outputs(self, block_name, output_variables, outputs, outputs_dict): + def _update_outputs(self, variable_name, output_variables, outputs, value): """Set the requested block outputs into the outputs list in the right place.""" - for key, value in outputs_dict.items(): - variable_name = '{}.{}'.format(block_name, key) - if variable_name in output_variables: - index = output_variables.index(variable_name) - outputs[index] = deepcopy(value) + if variable_name in output_variables: + index = output_variables.index(variable_name) + outputs[index] = deepcopy(value) def _fit_block(self, block, block_name, context): """Get the block args from the context and fit the block.""" @@ -554,7 +568,12 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): context.update(outputs_dict) if output_variables: - self._update_outputs(block_name, output_variables, outputs, outputs_dict) + 
if block_name in output_variables: + self._update_outputs(block_name, output_variables, outputs, context) + else: + for key, value in outputs_dict.items(): + variable_name = '{}.{}'.format(block_name, key) + self._update_outputs(variable_name, output_variables, outputs, value) except Exception: if self.verbose: @@ -609,17 +628,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - if isinstance(output_, str): - output_variables = self.get_output_variables(output_) - outputs = output_variables.copy() - output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} - else: + if output_ is None: output_variables = None outputs = None output_blocks = set() - - if isinstance(output_, int): - output_ = self._get_block_name(output_) + else: + output_variables, outputs, output_blocks = self._prepare_outputs(output_) if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -634,15 +648,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): self._fit_block(block, block_name, context) - last_block = block_name != self._last_block_name - if last_block or (block_name == output_) or (block_name in output_blocks): + if (block_name != self._last_block_name) or (block_name in output_blocks): self._produce_block(block, block_name, context, output_variables, outputs) # We already captured the output from this block if block_name in output_blocks: output_blocks.remove(block_name) - elif block_name == output_: - return context # If there was an output_ but there are no pending # outputs we are done. @@ -695,9 +706,7 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): if X is not None: context['X'] = X - output_variables = self.get_output_variables(output_) - outputs = output_variables.copy() - output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} + output_variables, outputs, output_blocks = self._prepare_outputs(output_) if isinstance(start_, int): start_ = self._get_block_name(start_) diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index d31d2dd8..50739cea 100644 --- a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -70,13 +70,14 @@ def test_fit_output(self): y = np.array([ 0, 0, 0, 0, 1 ]) + context = {'X': X, 'y': y} almost_equal(named_out, y) assert len(list_out) == 2 almost_equal(list_out[0], y) - almost_equal(list_out[1], X) - almost_equal(X, int_out) - almost_equal(X, str_out) + almost_equal(list_out[1], context) + almost_equal(context, int_out) + almost_equal(context, str_out) almost_equal(X, str_out_variable) assert no_output is None diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 7062e38e..f2edc36f 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -2,25 +2,36 @@ from collections import OrderedDict from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import MagicMock, call, patch +import pytest + +from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline +def get_mlblock_mock(*args, **kwargs): + return MagicMock(autospec=MLBlock) + + class TestMLPipline(TestCase): @patch('mlblocks.mlpipeline.LOGGER') @patch('mlblocks.mlpipeline.MLBlock') def test___init__(self, mlblock_mock, logger_mock): - blocks = [Mock(), Mock(), Mock()] - last_block = Mock() + blocks = [ + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock() + ] + 
last_block = blocks[-1] last_block.produce_output = [ { 'name': 'y', 'type': 'array' } ] - blocks.append(last_block) mlblock_mock.side_effect = blocks primitives = [ @@ -93,7 +104,7 @@ def test___init__(self, mlblock_mock, logger_mock): 'a.primitive.Name' ) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_tunable_hyperparameters(self): mlpipeline = MLPipeline(['a_primitive']) tunable = dict() @@ -104,7 +115,7 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_tunable_hyperparameters_flat(self): mlpipeline = MLPipeline(['a_primitive']) mlpipeline._tunable_hyperparameters = { @@ -160,13 +171,13 @@ def test_get_tunable_hyperparameters_flat(self): } assert returned == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_hyperparameters(self): - block_1 = Mock() + block_1 = get_mlblock_mock() block_1.get_hyperparameters.return_value = { 'a': 'a' } - block_2 = Mock() + block_2 = get_mlblock_mock() block_2.get_hyperparameters.return_value = { 'b': 'b', 'c': 'c', @@ -192,13 +203,13 @@ def test_get_hyperparameters(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_hyperparameters_flat(self): - block_1 = Mock() + block_1 = get_mlblock_mock() block_1.get_hyperparameters.return_value = { 'a': 'a' } - block_2 = Mock() + block_2 = get_mlblock_mock() block_2.get_hyperparameters.return_value = { 'b': 'b', 'c': 'c', @@ -220,10 +231,10 @@ def test_get_hyperparameters_flat(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_set_hyperparameters(self): - block_1 = Mock() - block_2 = Mock() + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -241,10 +252,10 @@ def test_set_hyperparameters(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_set_hyperparameters_flat(self): - block_1 = Mock() - block_2 = Mock() + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -260,7 +271,7 @@ def test_set_hyperparameters_flat(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_block_args(self): input_names = { 'a_block': { @@ -298,9 +309,10 @@ def test__get_block_args(self): } assert args == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_no_outputs(self): - self_ = Mock() + self_ = 
MagicMock(autospec=MLPipeline) + self_._last_block_name = 'last_block' self_._get_block_outputs.return_value = ['some', 'outputs'] @@ -315,9 +327,9 @@ def test__get_outputs_no_outputs(self): self_._get_block_outputs.assert_called_once_with('last_block') - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_defaults(self): - self_ = Mock() + self_ = MagicMock(autospec=MLPipeline) pipeline = dict() outputs = { @@ -331,9 +343,9 @@ def test__get_outputs_defaults(self): assert returned == expected self_._get_block_outputs.assert_not_called() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_additional(self): - self_ = Mock() + self_ = MagicMock(autospec=MLPipeline) pipeline = { 'outputs': { @@ -351,26 +363,90 @@ def test__get_outputs_additional(self): assert returned == expected self_._get_block_outputs.assert_not_called() - def test_get_outputs_str(self): - pass + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_named(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type', + } + ], + 'debug': [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + } + pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + returned = pipeline.get_outputs('debug') + + expected = [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + + assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_variable(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + returned = pipeline.get_outputs('a_primitive#1.output') + + expected = [ + { + 'name': 'output', + 'type': 'whatever', + 'variable': 'a_primitive#1.output' + } + ] + + assert returned == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_block(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + returned = pipeline.get_outputs('a_primitive#1') + + expected = [ + { + 'name': 'a_primitive#1', + 'variable': 'a_primitive#1', + } + ] + + assert returned == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_int(self): - pass + pipeline = MLPipeline(['a_primitive', 'another_primitive']) - def test_get_outputs_list_of_str(self): - pass + returned = pipeline.get_outputs(-1) - def test_get_outputs_list_of_int(self): - pass + expected = [ + { + 'name': 'another_primitive#1', + 'variable': 'another_primitive#1', + } + ] - def test_get_outputs_named_outputs(self): - pass + assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_combination(self): - pass - - @patch('mlblocks.mlpipeline.MLBlock') - def test_get_outputs_invalid(self, mlblock_mock): outputs = { 'default': [ { @@ -386,7 +462,6 @@ def test_get_outputs_invalid(self, mlblock_mock): } ] } - mlblock_mock.side_effect = [MagicMock(), MagicMock()] pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) pipeline.blocks['a_primitive#1'].produce_output = [ @@ -414,8 +489,8 @@ def test_get_outputs_invalid(self, mlblock_mock): 'variable': 'another_variable', }, { - 'name': 'something', - 'variable': 'another_primitive#1.something', 
+ 'name': 'another_primitive#1', + 'variable': 'another_primitive#1', }, { 'name': 'output', @@ -426,7 +501,21 @@ def test_get_outputs_invalid(self, mlblock_mock): assert returned == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_invalid(self): + pipeline = MLPipeline(['a_primitive']) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + with pytest.raises(ValueError): + pipeline.get_outputs('a_primitive#1.invalid') + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_output_names(self): outputs = { 'default': [ @@ -443,7 +532,7 @@ def test_get_output_names(self): assert names == ['a_name'] - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_output_variables(self): outputs = { 'default': [ From 6019adfeff7f167dcea2d7ec2ffc9a7864c16fee Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 31 Oct 2019 15:26:35 -0400 Subject: [PATCH 071/160] =?UTF-8?q?Bump=20version:=200.3.4.dev1=20?= =?UTF-8?q?=E2=86=92=200.3.4.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 81b45593..936c210f 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4.dev1' +__version__ = '0.3.4.dev2' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 0c2ea21a..58f63f5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4.dev1 +current_version = 0.3.4.dev2 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index da4bb6f3..60c97534 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev1', + version='0.3.4.dev2', zip_safe=False, ) From b7baf968b4be9d1b59384b13c9c55d5d5da3299e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:26 -0500 Subject: [PATCH 072/160] Release notes for v0.3.4 --- HISTORY.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index f3dc0a32..5b5d4f0b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,12 @@ Changelog ========= +0.3.4 - 2019-11-01 +------------------ + +* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala +* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala + 0.3.3 - 2019-09-09 ------------------ From b0cd3808f3291d9bd043362ff2e827ac626f8ef9 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:27 -0500 Subject: [PATCH 073/160] =?UTF-8?q?Bump=20version:=200.3.4.dev2=20?= =?UTF-8?q?=E2=86=92=200.3.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 936c210f..e4aa9838 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4.dev2' +__version__ = '0.3.4' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 58f63f5c..709511b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4.dev2 +current_version = 0.3.4 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 60c97534..7b243501 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev2', + version='0.3.4', zip_safe=False, ) From 6ede62caed212b84067021fca2d3b29d187a8554 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:40 -0500 Subject: [PATCH 074/160] =?UTF-8?q?Bump=20version:=200.3.4=20=E2=86=92=200?= =?UTF-8?q?.3.5.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index e4aa9838..618e7a55 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4' +__version__ = '0.3.5.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 709511b4..61208b1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4 +current_version = 0.3.5.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 7b243501..09483fb3 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4', + version='0.3.5.dev0', zip_safe=False, ) From 3ce7d89c3e81743c73400c0694ebb6e893acbc51 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 12 Dec 2019 15:38:54 +0100 Subject: [PATCH 075/160] Update paper references --- README.md | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 19f740ed..7c152fa3 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Pipelines and Primitives for Machine Learning and Data Science. * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks -- Homepage: https://github.com/HDI-Project/MLBlocks +* Homepage: https://github.com/HDI-Project/MLBlocks # MLBlocks @@ -237,10 +237,33 @@ If you want to learn more about how to tune the pipeline hyperparameters, save a the pipelines using JSON annotations or build complex multi-branched pipelines, please check our [documentation](https://HDI-Project.github.io/MLBlocks). -# History +## Citing MLBlocks + +If you use MLBlocks, please consider citing our related papers. + +For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at +the MIT Data To AI Lab, please see: + +Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar: +Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv +Preprint 1905.08942. 2019. + +``` bibtex +@article{smith2019mlbazaar, + author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan}, + title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development}, + journal = {arXiv e-prints}, + year = {2019}, + eid = {arXiv:1905.08942}, + pages = {arXiv:1905.08942}, + archivePrefix = {arXiv}, + eprint = {1905.08942}, +} +``` + +For the first MLBlocks version from 2015, designed for only multi table, multi entity temporal data, please +refer to Bryan Collazo’s thesis: -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: * [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf). Bryan Collazo. Masters thesis, MIT EECS, 2015. From 949c8b1d36abe3792e38bed3501645fde279a075 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 12 Dec 2019 15:53:36 +0100 Subject: [PATCH 076/160] Restrict dependency versions --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 09483fb3..1e8ef2ad 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ 'urllib3>=1.20,<1.25', 'setuptools>=41.0.0', 'numpy<1.17', + 'python-dateutil<2.8.1,>=2.1', ] From 6b8381a069e235d8083a02cac0e72550db3955e2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Dec 2019 19:18:09 +0100 Subject: [PATCH 077/160] Allow loading from json. 
 Deprecate old methods

---
 mlblocks/discovery.py   | 44 ++++++++++++++----------------------
 mlblocks/mlpipeline.py  | 11 +++++++++
 tests/test_discovery.py | 49 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 9a1dbef5..24a469da 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -198,6 +198,12 @@ def get_pipelines_paths():
     return _PIPELINES_PATHS + _load_entry_points('pipelines')
 
 
+def _load_json(json_path):
+    with open(json_path, 'r') as json_file:
+        LOGGER.debug('Loading %s', json_path)
+        return json.load(json_file)
+
+
 def _load(name, paths):
     """Locate and load the JSON annotation in any of the given paths.
 
@@ -206,8 +212,7 @@ def _load(name, paths):
 
     Args:
         name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
         paths (list):
             list of paths where the primitives will be looked for.
 
@@ -215,6 +220,9 @@ def _load(name, paths):
         dict:
             The content of the JSON annotation file loaded into a dict.
     """
+    if os.path.isfile(name):
+        return _load_json(name)
+
     for base_path in paths:
         parts = name.split('.')
         number_of_parts = len(parts)
@@ -225,12 +233,7 @@ def _load(name, paths):
             json_path = os.path.join(folder, filename)
 
             if os.path.isfile(json_path):
-                with open(json_path, 'r') as json_file:
-                    LOGGER.debug('Loading %s from %s', name, json_path)
-                    return json.load(json_file)
-
-
-_PRIMITIVES = dict()
+                return _load_json(json_path)
 
 
 def load_primitive(name):
@@ -241,8 +244,7 @@ def load_primitive(name):
 
     Args:
        name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
 
     Returns:
         dict:
@@ -252,20 +254,13 @@ def load_primitive(name):
         ValueError:
             A ``ValueError`` will be raised if the primitive cannot be found.
     """
-    primitive = _PRIMITIVES.get(name)
+    primitive = _load(name, get_primitives_paths())
     if primitive is None:
-        primitive = _load(name, get_primitives_paths())
-        if primitive is None:
-            raise ValueError("Unknown primitive: {}".format(name))
-
-        _PRIMITIVES[name] = primitive
+        raise ValueError("Unknown primitive: {}".format(name))
 
     return primitive
 
 
-_PIPELINES = dict()
-
-
 def load_pipeline(name):
     """Locate and load the pipeline JSON annotation.
 
@@ -274,8 +269,7 @@ def load_pipeline(name):
 
     Args:
         name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
 
     Returns:
         dict:
@@ -285,13 +279,9 @@ def load_pipeline(name):
         ValueError:
             A ``ValueError`` will be raised if the pipeline cannot be found.
     """
-    pipeline = _PIPELINES.get(name)
+    pipeline = _load(name, get_pipelines_paths())
     if pipeline is None:
-        pipeline = _load(name, get_pipelines_paths())
-        if pipeline is None:
-            raise ValueError("Unknown pipeline: {}".format(name))
-
-        _PIPELINES[name] = pipeline
+        raise ValueError("Unknown pipeline: {}".format(name))
 
     return pipeline

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..962d7c19 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import re
+import warnings
 from collections import Counter, OrderedDict, defaultdict
 from copy import deepcopy
 
@@ -814,6 +815,11 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the
             given specification dictionary.
         """
+        warnings.warn(
+            'MLPipeline.from_dict(pipeline_dict) is deprecated and will be removed in a '
+            'later release. Please use MLPipeline(dict) instead.',
+            DeprecationWarning
+        )
         return cls(metadata)
 
     @classmethod
@@ -831,6 +837,11 @@ def load(cls, path):
             A new MLPipeline instance with the specification found
             in the JSON file.
         """
+        warnings.warn(
+            'MLPipeline.load(path) is deprecated and will be removed in a later release. '
+            'Please use MLPipeline(path) instead.',
+            DeprecationWarning
+        )
         with open(path, 'r') as in_file:
             metadata = json.load(in_file)
 
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index dc3eca87..a11fc02c 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,55 @@ def test__load_success():
     assert primitive == loaded
 
 
+def test__load_json_path():
+    primitive = {
+        'name': 'temp.primitive',
+        'primitive': 'temp.primitive'
+    }
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        paths = [tempdir]
+        primitive_path = os.path.join(tempdir, 'temp.primitive.json')
+        with open(primitive_path, 'w') as primitive_file:
+            json.dump(primitive, primitive_file, indent=4)
+
+        loaded = discovery._load(primitive_path, paths)
+
+    assert primitive == loaded
+
+
+def _load(name, paths):
+    """Locate and load the JSON annotation in any of the given paths.
+
+    All the given paths will be scanned to find a JSON file with the given name,
+    and as soon as a JSON with the given name is found it is returned.
+
+    Args:
+        name (str):
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
+        paths (list):
+            list of paths where the primitives will be looked for.
+
+    Returns:
+        dict:
+            The content of the JSON annotation file loaded into a dict.
+    """
+    if os.path.isfile(name):
+        return _load_json(name)
+
+    for base_path in paths:
+        parts = name.split('.')
+        number_of_parts = len(parts)
+
+        for folder_parts in range(number_of_parts):
+            folder = os.path.join(base_path, *parts[:folder_parts])
+            filename = '.'.join(parts[folder_parts:]) + '.json'
+            json_path = os.path.join(folder, filename)
+
+            if os.path.isfile(json_path):
+                return _load_json(json_path)
+
+
 @patch('mlblocks.discovery.get_primitives_paths')
 @patch('mlblocks.discovery._load')

From be684dd593f89cd21bd74efb53d6aa97b8c02970 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Dec 2019 19:26:05 +0100
Subject: [PATCH 078/160] Remove unneeded code

---
 tests/test_discovery.py | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index a11fc02c..25e6e444 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,7 @@ def test__load_success():
     assert primitive == loaded
 
 
+
 def test__load_json_path():
     primitive = {
         'name': 'temp.primitive',
@@ -179,39 +180,6 @@ def test__load_json_path():
     assert primitive == loaded
 
 
-def _load(name, paths):
-    """Locate and load the JSON annotation in any of the given paths.
-
-    All the given paths will be scanned to find a JSON file with the given name,
-    and as soon as a JSON with the given name is found it is returned.
-
-    Args:
-        name (str):
-            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
-        paths (list):
-            list of paths where the primitives will be looked for.
-
-    Returns:
-        dict:
-            The content of the JSON annotation file loaded into a dict.
-    """
-    if os.path.isfile(name):
-        return _load_json(name)
-
-    for base_path in paths:
-        parts = name.split('.')
-        number_of_parts = len(parts)
-
-        for folder_parts in range(number_of_parts):
-            folder = os.path.join(base_path, *parts[:folder_parts])
-            filename = '.'.join(parts[folder_parts:]) + '.json'
-            json_path = os.path.join(folder, filename)
-
-            if os.path.isfile(json_path):
-                return _load_json(json_path)
-
-
 @patch('mlblocks.discovery.get_primitives_paths')
 @patch('mlblocks.discovery._load')
 def test__load_primitive_value_error(load_mock, gpp_mock):

From 1920227548edbb11b851b1864044cabc577b8e03 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 9 Jan 2020 11:41:23 -0500
Subject: [PATCH 079/160] Add get_inputs function

---
 mlblocks/mlpipeline.py   |  78 ++++++++++++++++++++++++++++
 tests/test_mlpipeline.py | 107 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..dce30cfe 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -154,6 +154,45 @@ def _get_block_outputs(self, block_name):
 
         return outputs
 
+    def _get_block_outputs_dict(self, block_name):
+        """Get dictionary of output variables for the given block."""
+        block = self.blocks[block_name]
+        outputs = deepcopy(block.produce_output)
+        output_names = self.output_names.get(block_name, dict())
+        output_dict = {}
+        for output in outputs:
+            name = output['name']
+            context_name = output_names.get(name, name)
+            output_dict[context_name] = output
+
+        return output_dict
+
+    def _get_block_inputs_dict(self, block_name):
+        """Get dictionary of input variables for the given block."""
+        block = self.blocks[block_name]
+        print(block.produce_args)
+        inputs = deepcopy(block.produce_args)
+        input_names = self.input_names.get(block_name, dict())
+        inputs_dict = {}
+        for input_value in inputs:
+ name = input_value['name'] + context_name = input_names.get(name, name) + inputs_dict[context_name] = input_value + return inputs_dict + + def _get_block_fit_inputs_dict(self, block_name): + """Get the list of fit input variables for the given block.""" + block = self.blocks[block_name] + fit_inputs = deepcopy(block.fit_args) + input_names = self.input_names.get(block_name, dict()) + fit_inputs_dict = {} + for fit_input in fit_inputs: + name = fit_input['name'] + context_name = input_names.get(name, name) + fit_inputs_dict[context_name] = fit_input + + return fit_inputs_dict + def _get_outputs(self, pipeline, outputs): """Get the output definitions from the pipeline dictionary. @@ -224,6 +263,45 @@ def _get_str_output(self, output): raise ValueError('Invalid Output Specification: {}'.format(output)) + def get_inputs(self, fit=True): + """Get a dictionary mapping all input variable names required by the + pipeline to a dictionary with their specified information. + + Can be specified to include fit arguments. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to ``True``. + + Returns: + dictionary: + A dictionary mapping every input variable's name to a dictionary + specifying the information corresponding to that input variable. + Each dictionary contains the entry ``name``, as + well as any other metadata that may have been included in the + pipeline inputs specification. + + Raises: + ValueError: + If an input specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + inputs = dict() + for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards + produce_outputs = self._get_block_outputs_dict(block_name) + for produce_output_name in produce_outputs.keys(): + inputs.pop(produce_output_name, None) + + produce_inputs = self._get_block_inputs_dict(block_name) + inputs.update(produce_inputs) + + if fit: + fit_inputs = self._get_block_fit_inputs_dict(block_name) + inputs.update(fit_inputs) + + return inputs + def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. 
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index f2edc36f..88cb8c44 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -549,6 +549,113 @@ def test_get_output_variables(self): assert names == ['a_variable'] + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_fit(self): + expected = { + 'input': { + 'name': 'input', + 'type': 'whatever', + }, + 'fit_input': { + 'name': 'fit_input', + 'type': 'whatever', + }, + 'another_input': { + 'name': 'another_input', + 'type': 'another_whatever', + } + + } + + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs() + assert inputs == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_no_fit(self): + expected = { + 'input': { + 'name': 'input', + 'type': 'whatever', + }, + 'another_input': { + 'name': 'another_input', + 'type': 'another_whatever', + } + + } + + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs(fit=False) + + assert inputs == expected + def test_fit(self): pass From 0d2108f00b5daa62aa37b6ce715ac7ea01bc0b3f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 9 Jan 2020 11:46:21 -0500 Subject: [PATCH 080/160] Remove incorrect docstring --- mlblocks/mlpipeline.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index dce30cfe..7f23bf28 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -280,12 +280,6 @@ def get_inputs(self, fit=True): Each dictionary contains the entry ``name``, as well as any other metadata that may have been included in the pipeline inputs specification. - - Raises: - ValueError: - If an input specification is not valid. - TypeError: - If the type of a specification is not an str or an int. 
""" inputs = dict() for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards From 4f456bdc3c5cb7200d0ca5e36a0ba05ec1e68e9f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Mon, 13 Jan 2020 17:09:40 -0500 Subject: [PATCH 081/160] Address comments --- mlblocks/mlpipeline.py | 85 +++++++++++++++++----------------------- tests/test_mlpipeline.py | 23 +++++++++++ 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 7f23bf28..fbd5bcf0 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -144,54 +144,35 @@ def _get_pipeline_dict(pipeline, primitives): def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" - block = self.blocks[block_name] - outputs = deepcopy(block.produce_output) - output_names = self.output_names.get(block_name, dict()) - for output in outputs: - name = output['name'] - context_name = output_names.get(name, name) + outputs = self._get_block_variables(block_name, + 'produce_output', + self.output_names.get(block_name, dict())) + for context_name, output in outputs.items(): output['variable'] = '{}.{}'.format(block_name, context_name) - return outputs - - def _get_block_outputs_dict(self, block_name): - """Get dictionary of output variables for the given block.""" - block = self.blocks[block_name] - outputs = deepcopy(block.produce_output) - output_names = self.output_names.get(block_name, dict()) - output_dict = {} - for output in outputs: - name = output['name'] - context_name = output_names.get(name, name) - output_dict[context_name] = output + return list(outputs.values()) - return output_dict + def _get_block_variables(self, block_name, variables_attr, names): + """Get dictionary of variable names to the variable for a given block - def _get_block_inputs_dict(self, block_name): - """Get dictionary of input variables for the given block.""" - block = self.blocks[block_name] - print(block.produce_args) - inputs = deepcopy(block.produce_args) - input_names = self.input_names.get(block_name, dict()) - inputs_dict = {} - for input_value in inputs: - name = input_value['name'] - context_name = input_names.get(name, name) - inputs_dict[context_name] = input_value - return inputs_dict - - def _get_block_fit_inputs_dict(self, block_name): - """Get the list of fit input variables for the given block.""" + Args: + block_name (str): + Name of the block for which to get the specification + variables_attr (str): + Name of the attribute that has the variables list. It can be + `fit_args`, `produce_args` or `produce_output`. + names (dict): + Dictionary used to translate the variable names. + """ block = self.blocks[block_name] - fit_inputs = deepcopy(block.fit_args) - input_names = self.input_names.get(block_name, dict()) - fit_inputs_dict = {} - for fit_input in fit_inputs: - name = fit_input['name'] - context_name = input_names.get(name, name) - fit_inputs_dict[context_name] = fit_input + variables = deepcopy(getattr(block, variables_attr)) + variable_dict = {} + for variable in variables: + name = variable['name'] + context_name = names.get(name, name) + variable_dict[context_name] = variable - return fit_inputs_dict + return variable_dict def _get_outputs(self, pipeline, outputs): """Get the output definitions from the pipeline dictionary. 
@@ -264,10 +245,11 @@ def _get_str_output(self, output): raise ValueError('Invalid Output Specification: {}'.format(output)) def get_inputs(self, fit=True): - """Get a dictionary mapping all input variable names required by the - pipeline to a dictionary with their specified information. + """Get a relation of all the input variables required by this pipeline. - Can be specified to include fit arguments. + The result is a dictionary that maps each variable name with their + specified information. + Optionally include the fit arguments. Args: fit (bool): @@ -283,15 +265,22 @@ def get_inputs(self, fit=True): """ inputs = dict() for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards - produce_outputs = self._get_block_outputs_dict(block_name) + produce_outputs = self._get_block_variables(block_name, + 'produce_output', + self.output_names.get(block_name, dict())) + for produce_output_name in produce_outputs.keys(): inputs.pop(produce_output_name, None) - produce_inputs = self._get_block_inputs_dict(block_name) + produce_inputs = self._get_block_variables(block_name, + 'produce_args', + self.input_names.get(block_name, dict())) inputs.update(produce_inputs) if fit: - fit_inputs = self._get_block_fit_inputs_dict(block_name) + fit_inputs = self._get_block_variables(block_name, + 'fit_args', + self.input_names.get(block_name, dict())) inputs.update(fit_inputs) return inputs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 88cb8c44..4fb779b8 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -549,6 +549,29 @@ def test_get_output_variables(self): assert names == ['a_variable'] + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test__get_block_variables(self): + expected = { + 'name_output': { + 'name': 'output', + 'type': 'whatever', + } + } + + pipeline = MLPipeline(['a_primitive']) + + pipeline.blocks['a_primitive#1'].produce_outputs = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + outputs = pipeline._get_block_variables('a_primitive#1', + 'produce_outputs', + {'output': 'name_output'}) + assert outputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_inputs_fit(self): expected = { From 1dd0f372111a775a1d27b2c77641f7fa884a552f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 14 Jan 2020 10:19:50 -0500 Subject: [PATCH 082/160] Change indenting --- AUTHORS.rst | 1 + mlblocks/mlpipeline.py | 32 ++++++++++++++++++++------------ tests/test_mlpipeline.py | 8 +++++--- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index eb8885c9..7245c735 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,3 +10,4 @@ Contributors * William Xue * Akshay Ravikumar * Laura Gustafson +* Erica Chiu diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index fbd5bcf0..35273642 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -144,9 +144,11 @@ def _get_pipeline_dict(pipeline, primitives): def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" - outputs = self._get_block_variables(block_name, - 'produce_output', - self.output_names.get(block_name, dict())) + outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) for context_name, output in outputs.items(): output['variable'] = '{}.{}'.format(block_name, context_name) @@ -265,22 +267,28 @@ def get_inputs(self, fit=True): """ inputs = dict() for block_name in 
reversed(self.blocks.keys()): # iterates through pipeline backwards - produce_outputs = self._get_block_variables(block_name, - 'produce_output', - self.output_names.get(block_name, dict())) + produce_outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) for produce_output_name in produce_outputs.keys(): inputs.pop(produce_output_name, None) - produce_inputs = self._get_block_variables(block_name, - 'produce_args', - self.input_names.get(block_name, dict())) + produce_inputs = self._get_block_variables( + block_name, + 'produce_args', + self.input_names.get(block_name, dict()) + ) inputs.update(produce_inputs) if fit: - fit_inputs = self._get_block_variables(block_name, - 'fit_args', - self.input_names.get(block_name, dict())) + fit_inputs = self._get_block_variables( + block_name, + 'fit_args', + self.input_names.get(block_name, dict()) + ) inputs.update(fit_inputs) return inputs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 4fb779b8..340a3838 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -567,9 +567,11 @@ def test__get_block_variables(self): } ] - outputs = pipeline._get_block_variables('a_primitive#1', - 'produce_outputs', - {'output': 'name_output'}) + outputs = pipeline._get_block_variables( + 'a_primitive#1', + 'produce_outputs', + {'output': 'name_output'} + ) assert outputs == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) From 93994e2a0c177fb8bab33f7fe57dd1eaae61a708 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 23 Jan 2020 20:43:15 +0100 Subject: [PATCH 083/160] Add notebook tutorials and examples --- Makefile | 4 + ...ification.categorical_encoder.xgboost.json | 16 + .../mlblocks.examples.ClassPrimitive.json | 104 ++ .../mlblocks.examples.function_primitive.json | 86 ++ .../tutorials/1. Using and MLPipeline.ipynb | 633 +++++++++++++ .../2. Finding and Loading a Pipeline.ipynb | 123 +++ .... Setting MLPipeline Hyperparameters.ipynb | 430 +++++++++ .../4. Saving and Loading a Pipeline.ipynb | 181 ++++ examples/tutorials/5. Tuning a Pipeline.ipynb | 463 +++++++++ ...or the best pipeline with BTBSession.ipynb | 895 ++++++++++++++++++ setup.py | 7 + 11 files changed, 2942 insertions(+) create mode 100644 examples/pipelines/single_table.classification.categorical_encoder.xgboost.json create mode 100644 examples/primitives/mlblocks.examples.ClassPrimitive.json create mode 100644 examples/primitives/mlblocks.examples.function_primitive.json create mode 100644 examples/tutorials/1. Using and MLPipeline.ipynb create mode 100644 examples/tutorials/2. Finding and Loading a Pipeline.ipynb create mode 100644 examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb create mode 100644 examples/tutorials/4. Saving and Loading a Pipeline.ipynb create mode 100644 examples/tutorials/5. Tuning a Pipeline.ipynb create mode 100644 examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb diff --git a/Makefile b/Makefile index 6e8dd203..bfc1a5f6 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,10 @@ clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all install: clean-build clean-pyc ## install the package to the active Python's site-packages pip install . 
+.PHONY: install-examples
+install-examples: clean-build clean-pyc ## install the package and the examples dependencies
+	pip install .[examples]
+
 .PHONY: install-test
 install-test: clean-build clean-pyc ## install the package and test dependencies
 	pip install .[test]
diff --git a/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
new file mode 100644
index 00000000..4dca4002
--- /dev/null
+++ b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
@@ -0,0 +1,16 @@
+{
+    "metadata": {
+        "data_modality": "single_table",
+        "task_type": "classification"
+    },
+    "validation": {
+        "dataset": "census"
+    },
+    "primitives": [
+        "mlprimitives.custom.preprocessing.ClassEncoder",
+        "mlprimitives.custom.feature_extraction.CategoricalEncoder",
+        "sklearn.impute.SimpleImputer",
+        "xgboost.XGBClassifier",
+        "mlprimitives.custom.preprocessing.ClassDecoder"
+    ]
+}
diff --git a/examples/primitives/mlblocks.examples.ClassPrimitive.json b/examples/primitives/mlblocks.examples.ClassPrimitive.json
new file mode 100644
index 00000000..6c29e51e
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.ClassPrimitive.json
@@ -0,0 +1,104 @@
+{
+    "name": "the_primitive_name",
+    "primitive": "full.python.path.to.AClass",
+    "fit": {
+        "method": "fit",
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_fit_method_argument",
+                "description": "each input can be described",
+                "type": "pandas.DataFrame"
+            },
+            {
+                "name": "y",
+                "description": "each input can be described",
+                "default": "default_value_for_this_argument",
+                "type": "pandas.Series"
+            }
+        ]
+    },
+    "produce": {
+        "method": "predict",
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_produce_method_argument",
+                "description": "each input can be described",
+                "type": "DataFrame"
+            }
+        ],
+        "output": [
+            {
+                "name": "y",
+                "description": "each output argument can be described",
+                "type": "Series"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "a_required_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that needs to be specified by the user because it does not have a default value",
+                "type": "int"
+            },
+            "an_optional_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that is optional because it has a default value",
+                "type": "int",
+                "default": 1
+            }
+        },
+        "tunable": {
+            "a_simple_range_hyperparameter": {
+                "description": "hyperparameter documentation can be put here",
+                "default": 1,
+                "type": "int",
+                "range": [1, 10]
+            },
+            "a_categorical_hyperparameter_of_type_int": {
+                "description": "Note that it has the field `values` instead of `range`",
+                "default": 1,
+                "type": "int",
+                "values": [1, 3, 7, 10]
+            },
+            "a_categorical_hyperparameter_of_type_str": {
+                "default": "a",
+                "type": "str",
+                "values": ["a", "b", "c"]
+            },
+            "a_multi_type_hyperparameter": {
+                "description": "this is a hyperparameter that allows more than one type",
+                "type": "multitype",
+                "default": "auto",
+                "types": {
+                    "int": {
+                        "description": "documentation can also be included here",
+                        "range": [1, 10]
+                    },
+                    "string": {
+                        "values": ["some", "string", "values"]
+                    }
+                }
+            },
+            "conditional_hyperparameter": {
+                "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter",
+                "type": "conditional",
+                "condition": "the_name_of_the_other_hyperparameter",
+                "values": {
+                    "a": {
+                        "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+                        "type": "int",
+                        "default": 0,
+                        "range": [0, 10]
+                    },
+                    "*": {
+                        "description": "this will be used only if the value does not match any other definition",
+                        "type": "float",
+                        "default": 0.0,
+                        "range": [0.0, 1.0]
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/examples/primitives/mlblocks.examples.function_primitive.json b/examples/primitives/mlblocks.examples.function_primitive.json
new file mode 100644
index 00000000..f3627bd9
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.function_primitive.json
@@ -0,0 +1,86 @@
+{
+    "name": "the_primitive_name",
+    "primitive": "full.python.path.to.a_function",
+    "produce": {
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_produce_method_argument",
+                "description": "each input can be described",
+                "type": "DataFrame"
+            }
+        ],
+        "output": [
+            {
+                "description": "each output argument can be described",
+                "name": "y",
+                "type": "Series"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "a_required_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that needs to be specified by the user, because it does not have a default value",
+                "type": "int"
+            },
+            "an_optional_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that is optional, because it has a default value",
+                "type": "int",
+                "default": 1
+            }
+        },
+        "tunable": {
+            "a_simple_range_hyperparameter": {
+                "description": "hyperparameter documentation can be put here",
+                "default": 1,
+                "type": "int",
+                "range": [1, 10]
+            },
+            "a_categorical_hyperparameter_of_type_int": {
+                "description": "Note that it has the field `values` instead of `range`",
+                "default": 1,
+                "type": "int",
+                "values": [1, 3, 7, 10]
+            },
+            "a_categorical_hyperparameter_of_type_str": {
+                "default": "a",
+                "type": "str",
+                "values": ["a", "b", "c"]
+            },
+            "a_multi_type_hyperparameter": {
+                "description": "this is a hyperparameter that allows more than one type",
+                "type": "multitype",
+                "default": "auto",
+                "types": {
+                    "int": {
+                        "description": "documentation can also be included here",
+                        "range": [1, 10]
+                    },
+                    "string": {
+                        "values": ["some", "string", "values"]
+                    }
+                }
+            },
+            "conditional_hyperparameter": {
+                "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter",
+                "type": "conditional",
+                "condition": "the_name_of_the_other_hyperparameter",
+                "values": {
+                    "a": {
+                        "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+                        "type": "int",
+                        "default": 0,
+                        "range": [0, 10]
+                    },
+                    "*": {
+                        "description": "this will be used only if the value does not match any other definition",
+                        "type": "float",
+                        "default": 0.0,
+                        "range": [0.0, 1.0]
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb
new file mode 100644
index 00000000..733fb42d
--- /dev/null
+++ b/examples/tutorials/1. Using and MLPipeline.ipynb
@@ -0,0 +1,633 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using an MLPipeline\n",
+    "\n",
+    "In this short guide we will go over the basic MLPipeline functionality.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a demo dataset.\n",
+    "2. Build a pipeline.\n",
+    "3. Explore the pipeline primitives, inputs and outputs.\n",
+    "4. Fit the pipeline to the dataset.\n",
+    "5. Make predictions using the fitted pipeline.\n",
+    "6. Evaluate the pipeline performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the Dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset using the function provided by mlprimitives"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This version of the Census dataset is prepared as a Classification (Supervised) Problem,\n",
+    "and has an input matrix `X` and an expected outcome `y` array."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Adult Census dataset.\n",
+      "\n",
+      "    Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n",
+      "\n",
+      "    Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n",
+      "    records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n",
+      "    (AFNLWGT>1)&& (HRSWK>0))\n",
+      "\n",
+      "    Prediction task is to determine whether a person makes over 50K a year.\n",
+      "\n",
+      "    source: \"UCI\n",
+      "    sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"/n",
+      "    \n",
+      "Data Modality: single_table\n",
+      "Task Type: classification\n",
+      "Task Subtype: binary\n",
+      "Data shape: (32561, 14)\n",
+      "Target shape: (32561,)\n",
+      "Metric: accuracy_score\n",
+      "Extras: \n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The data from the dataset can be explored by looking at its `.data` and `.target` attributes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174 0 40 United-States \n", + "1 0 0 13 United-States \n", + "2 0 0 40 United-States \n", + "3 0 0 40 United-States \n", + "4 0 0 40 Cuba " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.target[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset data can also be splitted in multipe parts for cross validation using the `dataset.get_splits` method.\n", + "\n", + "For this demo we will be making only one split, which is equivalent to a simple train/test holdout partitioning." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(24420, 14)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8141, 14)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a pipeline\n", + "\n", + "Once we have the dataset we will build a pipeline that works with it.\n", + "\n", + "In this case, we will be creating a short pipeline that uses the following primitives:\n", + "\n", + "- `ClassEncoder` from `mlprimitives`, which encodes the target variable `y` as integers.\n", + "- `CategoricaEncoder` from `mlprimitives`, which encodes all the categorical variables from the feature matrix `X`\n", + " using one-hot encoding.\n", + "- `SimpleImputer` from `sklearn`, which imputes any null values that may exist in the feature matrix `X`\n", + "- `XGBClassifier` from `xgboost`, which learns to predict the target variable `y` sing the feature matrix `X`.\n", + "- `ClassDecoder` from `mlprimitives`, which reverts the `ClassEncoder` transformation to return the original\n", + " target labels instead of integers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the primitives included in this pipeline by having a look at its `primitives` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inputs\n", + "\n", + "We can also see the inputs of the pipeline using the `get_inputs` method.\n", + "\n", + "This will traverse the pipeline execution graph and show all the variables that need to be\n", + "provided by the user in order to fit this pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'type': 'ndarray'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can pass the `fit=False` argument, which will give us the variables needed\n", + "in order to make predictions." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'default': None, 'type': 'ndarray'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs(fit=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note how the `fit` method expects two variables `X` and `y`, while the `predict`\n", + "method only needs `X`, as the `y` variable has a default value of `None`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outputs\n", + "\n", + "Equally, we can see the outputs that the pipeline will return when used to make predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'y',\n", + " 'type': 'ndarray',\n", + " 'variable': 'mlprimitives.custom.preprocessing.ClassDecoder#1.y'}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_outputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the Pipeline to the Dataset\n", + "\n", + "Now that the pipeline is ready and we know its inputs and outputs, we can fit it to the\n", + "dataset by passing the training `X` and `y` variables to its `fit` method." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "After the pipeline has finished fitting, we can try to predict the `y_test` array values by\n", + "passing the `X_test` matrix to the `pipeline.predict` method." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "predictions = pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the pipeline performance\n", + "\n", + "Now we can compare the predicted array with the actual test array to see how well\n", + "our pipeline performed.\n", + "\n", + "This can be done using the `dataset.score` method, which provides a suitable scoring\n", + "function for this kind of data and problem.\n", + "In this case, the dataset is just computing the accuracy score." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb new file mode 100644 index 00000000..a94c48bc --- /dev/null +++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finding and Loading a Pipeline\n", + "\n", + "In this short tutorial we will show you how to search for pipelines suitable to solve\n", + "your prediction problem."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to find a suitable pipeline, the first thing we need is to identify\n", + "the type of problem (data modality + task type) that we are facing.\n", + "\n", + "This is a full list of current data modalities and task types that we cover:\n", + "\n", + "| Problem Type | Data Modality | Task Type |\n", + "|:-------------------------------------|:--------------|:------------------------|\n", + "| Single Table Classification | single_table | classification |\n", + "| Single Table Regression | single_table | regression |\n", + "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n", + "| Multi Table Classification | multi_table | classification |\n", + "| Multi Table Regression | multi_table | regression |\n", + "| Time Series Classification | timeseries | classification |\n", + "| Time Series Regression | timeseries | regression |\n", + "| Time Series Forecasting | timeseries | forecasting |\n", + "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n", + "| Image Classification | image | classification |\n", + "| Image Regression | image | regression |\n", + "| Graph Link Prediction | graph | link_prediction |\n", + "| Graph Vertex Nomination | graph | vertex_nomination |\n", + "| Graph Community Detection | graph | community_detection |\n", + "| Graph Matching | graph | graph_matching |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have identified our data modality and task type we can use the\n", + "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n", + "that support this particular problem type.\n", + "\n", + "For example, if we are looking for a pipeline to work on Image Classification\n", + "we will do the following query." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['image.classification.hog.random_forest',\n", + " 'image.classification.hog.xgboost',\n", + " 'image.classification.resnet50.xgboost']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "filters = {\n", + " 'metadata.data_modality': 'image',\n", + " 'metadata.task_type': 'classification',\n", + "}\n", + "\n", + "find_pipelines(filters=filters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n", + "by passing its name to the `MLPipeline` constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('image.classification.resnet50.xgboost')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb new file mode 100644 index 00000000..29f60a8f --- /dev/null +++ b/examples/tutorials/3. 
Setting MLPipeline Hyperparameters.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting MLPipeline Hyperparameters\n", + "\n", + "In this short guide we will see how to modify the hyperparameters\n", + "of an MLPipeline in order to change its behavior or performance.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load a dataset and a Pipeline.\n", + "2. Explore the pipeline hyperparameters.\n", + "3. Reload the pipeline with different hyperparameters.\n", + "4. Evaluate the pipeline performance on the dataset.\n", + "5. Set different pipeline hyperparameters.\n", + "6. Re-evaluate the pipeline performance on the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset and the Pipeline\n", + "\n", + "The first step will be to load the dataset and the pipeline that we will be using." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')\n", + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline Hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n", + "calling its `get_hyperparameters` method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'mean'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'n_estimators': 100,\n", + " 'max_depth': 3,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us a dictionary that contains one entry for each step in the pipeline.\n", + "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n", + "\n", + "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n", + "\n", + "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n", + "that work with flat, one-level dictionaries, the argument `flat=True` can be passed."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'keep'): False,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'features'): 'auto',\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_unique_ratio'): 0,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n", + " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n", + " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n", + " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us the same information as before, but organized as a single one-level\n", + "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Pipeline hyperparameter values\n", + "\n", + "We can set some different hyperparameter values when loading the pipeline by adding the\n", + "`init_params` argument to `MLPipeline`.\n", + "\n", + "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n", + "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n", + "want to use on that step.\n", + "\n", + "As an example, we will set a different imputer strategy and a different xgboost max depth." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "init_params = {\n", + " 'sklearn.impute.SimpleImputer#1': {\n", + " 'strategy': 'median'\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'max_depth': 4\n", + " }\n", + "}\n", + "pipeline = MLPipeline(\n", + " 'single_table.classification.categorical_encoder.xgboost',\n", + " init_params=init_params\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now see how the hyperparameters are different from before."
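+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick illustration, the flat dictionary can be regrouped into the nested format\n", + "whenever needed. The following `unflatten` helper is just a sketch, not part of the\n", + "MLBlocks API:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "def unflatten(flat_hyperparameters):\n", + "    \"\"\"Regroup a flat {(step, name): value} dict into {step: {name: value}}.\"\"\"\n", + "    nested = defaultdict(dict)\n", + "    for (step, name), value in flat_hyperparameters.items():\n", + "        nested[step][name] = value\n", + "    return dict(nested)\n", + "\n", + "nested = unflatten(pipeline.get_hyperparameters(flat=True))"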
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate the pipeline performance to see what results these\n", + "hyperparameters produce." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8647586291610367" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting hyperparameter values\n", + "\n", + "Another way of setting the pipeline hyperparameters without having to recreate it\n", + "from scratch is to use its `set_hyperparameters` method.\n", + "\n", + "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n", + " 'max_labels': 10\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.3\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the hyperparameters can be set using the `flat` format:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can see how these hyperparameters are now different from before:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 10},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.3,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate the pipeline performance again and see how the hyperparameter\n", + "change affected it." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.870531875690947" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb new file mode 100644 index 00000000..193daaf3 --- /dev/null +++ b/examples/tutorials/4. 
Saving and Loading a Pipeline.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving and Loading a Pipeline\n", + "\n", + "This short guide shows how to serialize a Pipeline into a file and later load it\n", + "to make predictions.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load and fit a pipeline to a dataset.\n", + "2. Save the pipeline to a file.\n", + "3. Load the pipeline as a new object.\n", + "4. Make predictions using the new pipeline object." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the pipeline\n", + "\n", + "The first step will be to load and fit the pipeline to the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the Pipeline\n", + "\n", + "Once the pipeline is fit and ready to make predictions we can store it in a file.\n", + "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('pipeline.pkl', 'wb') as f:\n", + " pickle.dump(pipeline, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Pipeline\n", + "\n", + "The saved pipeline can then be moved to another system where we can load it back to\n", + "memory using pickle again." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pipeline.pkl', 'rb') as f:\n", + " loaded_pipeline = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT**: All the dependencies also need to be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives."
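+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "One possible way to keep track of those dependencies is to store the versions of the\n",
+ "key libraries next to the pickle. This is only a sketch of one approach, not an\n",
+ "MLBlocks feature:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "import mlblocks\n",
+ "import mlprimitives\n",
+ "\n",
+ "# Record the versions used to fit the pipeline, so the loading system\n",
+ "# can install matching dependencies before unpickling.\n",
+ "versions = {\n",
+ "    'mlblocks': mlblocks.__version__,\n",
+ "    'mlprimitives': mlprimitives.__version__,\n",
+ "}\n",
+ "with open('pipeline_requirements.json', 'w') as f:\n",
+ "    json.dump(versions, f)"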
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "Once the pipeline is loaded it is ready to make predictions again." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "pred = pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred[0:5]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/5. Tuning a Pipeline.ipynb new file mode 100644 index 00000000..8dbc4366 --- /dev/null +++ b/examples/tutorials/5. Tuning a Pipeline.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tuning a Pipeline\n", + "\n", + "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "1. Load a dataset and a pipeline.\n", + "2. Explore the pipeline tunable hyperparameters.\n", + "3. Write a scoring function.\n", + "4. Build a BTB Tunable and BTB Tuner.\n", + "5. Write a tuning loop." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset and the pipeline\n", + "\n", + "The first step will be to load the dataset that we were using in previous tutorials." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And load a suitable pipeline.\n", + "\n", + "Note how in this case we are using the variable name `template` instead of `pipeline`,\n", + "because this will only be used as a template for the pipelines that we will create\n", + "and evaluate during the later tuning loop." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the pipeline tunable hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n", + "by calling the `get_tunable_hyperparameters` method.\n", + "\n", + "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n", + "that is compatible with BTB."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunable_hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write a scoring function\n", + "\n", + "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n", + "\n", + "For this reason, we will start by writing a scoring function that will expect only one\n", + "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n", + "method from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(hyperparameters=None):\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By calling this function without any arguments we will obtain the score achieved\n", + "with the default hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8639171383183359" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_score = cross_validate()\n", + "default_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, we can verify that, by passing a hyperparameters dictionary, the new hyperparameters\n", + "will be used, resulting in a different score."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8686773872402614" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hyperparameters = {\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", + "}\n", + "cross_validate(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a BTB Tunable\n", + "\n", + "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n", + "\n", + "For this we will use its `from_dict` method, passing our hyperparameters dict." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.tuning import Tunable\n", + "\n", + "tunable = Tunable.from_dict(tunable_hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the BTB Tuner\n", + "\n", + "After creating the Tunable, we need to create a Tuner to tune it.\n", + "\n", + "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n", + "for the optimization." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.tuning import GPTuner\n", + "\n", + "tuner = GPTuner(tunable)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, since we already know the score obtained by the default arguments and\n", + "these have a high probability of being already decent, we will inform the tuner\n", + "about their performance.\n", + "\n", + "In order to obtain the default hyperparameters used before, we can either call\n", + "the template `get_hyperparameters(flat=True)` method or use `tunable.get_defaults()`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "defaults = tunable.get_defaults()\n", + "defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "tuner.record(defaults, default_score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the Tuning loop\n", + "\n", + "Once we have the tuner ready we can start the tuning loop.\n", + "\n", + "During this loop we will:\n", + "\n", + "1. Ask the tuner for a new hyperparameter proposal.\n", + "2. Run the `cross_validate` function to evaluate these hyperparameters.\n", + "3. Record the obtained score back to the tuner.\n", + "4. If the obtained score is better than the previous one, store the proposal."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scoring pipeline 1\n", + "scoring pipeline 2\n", + "New best found: 0.8722706212975673\n", + "scoring pipeline 3\n", + "scoring pipeline 4\n", + "scoring pipeline 5\n", + "scoring pipeline 6\n", + "scoring pipeline 7\n", + "scoring pipeline 8\n", + "scoring pipeline 9\n", + "scoring pipeline 10\n" + ] + } + ], + "source": [ + "best_score = default_score\n", + "best_proposal = defaults\n", + "\n", + "for iteration in range(10):\n", + " print(\"scoring pipeline {}\".format(iteration + 1))\n", + " \n", + " proposal = tuner.propose()\n", + " score = cross_validate(proposal)\n", + " \n", + " tuner.record(proposal, score)\n", + " \n", + " if score > best_score:\n", + " print(\"New best found: {}\".format(score))\n", + " best_score = score\n", + " best_proposal = proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n", + "which can be used to generate a new pipeline instance." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 40,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_proposal" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline = MLPipeline(template.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.set_hyperparameters(best_proposal)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.fit(dataset.data, dataset.target)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb new file mode 100644 index 00000000..a1f0c0f4 --- /dev/null +++ b/examples/tutorials/6. 
Searching for the best pipeline with BTBSession.ipynb @@ -0,0 +1,895 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Selecting and Tuning Pipelines\n", + "\n", + "This guide shows you how to search for multiple pipelines for your problem\n", + "and later use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n", + "to select and tune the best one.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "\n", + "1. Load a dataset\n", + "2. Search and load suitable templates\n", + "3. Write a scoring function\n", + "4. Build a BTBSession for our templates\n", + "5. Run the session to find the best pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset\n", + "\n", + "The first step will be to load the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adult Census dataset.\n", + "\n", + " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n", + "\n", + " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n", + " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n", + " (AFNLWGT>1)&& (HRSWK>0))\n", + "\n", + " Prediction task is to determine whether a person makes over 50K a year.\n", + "\n", + " source: \"UCI\n", + " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income\"\n", + " \n", + "Data Modality: single_table\n", + "Task Type: classification\n", + "Task Subtype: binary\n", + "Data shape: (32561, 14)\n", + "Target shape: (32561,)\n", + "Metric: accuracy_score\n", + "Extras: \n" + ] + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find and load suitable Templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `mlblocks.discovery.find_pipelines` function to search\n", + "for compatible pipelines.\n", + "\n", + "In this case, we will be looking for `single_table/classification` pipelines."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "filters = {\n", + " 'metadata.data_modality': 'single_table',\n", + " 'metadata.task_type': 'classification'\n", + "}\n", + "templates = find_pipelines(filters=filters)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['single_table.classification.categorical_encoder.logit',\n", + " 'single_table.classification.categorical_encoder.random_forest',\n", + " 'single_table.classification.categorical_encoder.xgboost',\n", + " 'single_table.classification.mlprimitives.logit',\n", + " 'single_table.classification.mlprimitives.random_forest',\n", + " 'single_table.classification.mlprimitives.xgboost',\n", + " 'single_table.classification.mlprimitives_text.xgboost']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we will create a dictionary with MLPipeline instances that will be used as templates for our tuning." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "templates_dict = {\n", + " template: MLPipeline(template)\n", + " for template in templates\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<mlblocks.mlpipeline.MLPipeline object at 0x...>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates_dict['single_table.classification.mlprimitives.xgboost']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a scoring function\n", + "\n", + "In order to use a `BTBSession` we will need a function that is able to score a proposal,\n", + "which will always be a pair of template name and proposed hyperparameters.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation over our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(template_name, hyperparameters=None):\n", + " template = templates_dict[template_name]\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup the BTBSession" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will create another dictionary with the tunable hyperparameters of each template.\n", + "This will be used by the BTBSession to know how to tune each template."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "tunables = {\n", + " name: template.get_tunable_hyperparameters(flat=True)\n", + " for name, template in templates_dict.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): {'type': 'bool', 'default': True},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): {'type': 'bool', 'default': True},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunables['single_table.classification.mlprimitives.xgboost']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then create a `BTBSession` instance passing them and the `cross_validate` function.\n", + "\n", + "We will also be setting it in `verbose` mode, so we can have better insight into what is going on." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.session import BTBSession\n", + "\n", + "session = BTBSession(tunables, cross_validate, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the Session" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After everything is set up, we can start running the tuning session passing it\n", + "the number of iterations that we want to perform."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n", + "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n", + "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", + " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.run(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "During this loop, the BTBSession will build pipelines based on our templates and evaluate them\n", + "using our scoring function." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Results\n", + "\n", + "When the session finishes running it will return the best proposal available and the\n", + "obtained score.\n", + "\n", + "These results are also available as the `best_proposal` attribute of the `BTBSession` object."
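+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch, the winning proposal can be turned back into a fitted pipeline\n",
+ "by using its `name` and `config` entries, which we saw in the result above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_proposal = session.best_proposal\n",
+ "\n",
+ "# Build a new pipeline from the winning template and apply the tuned\n",
+ "# hyperparameters before fitting it on the full dataset.\n",
+ "best_pipeline = MLPipeline(best_proposal['name'])\n",
+ "best_pipeline.set_hyperparameters(best_proposal['config'])\n",
+ "best_pipeline.fit(dataset.data, dataset.target)"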
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", + " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.best_proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Continue Running" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we feel that the score can still be improved and want to keep searching, we can simply run the session again which will continue tuning over the previous results." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a76ce44e1173496e99baaf7ee39a3df7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n", + "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " 
File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n", + " self._produce_block(block, block_name, context, output_variables, outputs)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n", + "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n", + "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n", + "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n", + 
"('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", + "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", + "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n", + "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", + "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n", + "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n", + "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", + " self._fit_block(block, block_name, context)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. 
Removing tunable single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n", + "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n", + "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", + "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", + "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n", + "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", + "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n", + "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n", + "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", + " self._fit_block(block, block_name, context)\n", + " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. 
\"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n", + "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", + " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 22,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 3863,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", + " 'score': 0.8739290413691612}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.run(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: If you look at the logs you will notice how the BTBSession captures the errors that finds\n", + "while executing the pipelines and automatically discards the failing tempaltes to be able to continue\n", + "the tuning session without wasting time on them.\n", + "\n", + "The number of errors that we want to wait before discarding a template can be changed passing the\n", + "`max_errors` argument to the `BTBSession` when it is build.\n", + "\n", + "Isn't it cool?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the best pipeline\n", + "\n", + "Once we are satisfied with the results, we can then build an instance of the best pipeline\n", + "by reading the `best_proposal` attribute from the `session`." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
+       " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+       " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+       "   'max_labels'): 22,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'lowercase'): True,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'binary'): True,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'max_features'): 3863,\n",
+       "  ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+       "  ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+       "  ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+       "  ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
+       "  ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
+       "  ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
+       " 'score': 0.8739290413691612}"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "best_proposal = session.best_proposal\n",
+    "best_proposal"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template = templates_dict[best_proposal['name']]\n",
+    "\n",
+    "pipeline = MLPipeline(template.to_dict())\n",
+    "pipeline.set_hyperparameters(best_proposal['config'])\n",
+    "\n",
+    "pipeline.fit(dataset.data, dataset.target)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore other results\n",
+    "\n",
+    "Optionally, if we are interested in exploring the results of the previous proposals, we can access them\n",
+    "in the `proposals` attribute of the `session` object.\n",
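+    "\n",
+    "As a quick sketch of how these could be explored further (assuming, as the outputs below suggest,\n",
+    "that each proposal is a dict and that crashed proposals keep a `score` of `None`), we could rank\n",
+    "the scored proposals:\n",
+    "\n",
+    "```python\n",
+    "scored = [p for p in session.proposals.values() if p.get('score') is not None]\n",
+    "ranking = sorted(scored, key=lambda p: p['score'], reverse=True)\n",
+    "```"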
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n", + " 'name': 'single_table.classification.categorical_encoder.logit',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n", + " 'score': 0.7975185708718643},\n", + " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n", + " 'name': 'single_table.classification.categorical_encoder.random_forest',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1,\n", + " ('sklearn.ensemble.RandomForestClassifier#1',\n", + " 'min_weight_fraction_leaf'): 0.0,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n", + " 'score': 0.7591904454179904}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(session.proposals.values())[0:2]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index 1e8ef2ad..6045c574 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,12 @@ ] +examples_require = [ + 'mlprimitives>=0.2.4.dev0', + 'jupyter==1.0.0' +] + + tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', @@ -88,6 +94,7 @@ extras_require={ 'dev': development_requires + tests_require, 'test': tests_require, + 'examples': examples_require, }, include_package_data=True, install_requires=install_requires, From c2f862b55ec52e6b7c431fe741bd83f7366b6a09 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 15:20:27 -0500 Subject: [PATCH 084/160] Update tutorials --- 
.gitignore                                    |   1 +
 examples/README.md                            |  57 ++
 .../2. Finding and Loading a Pipeline.ipynb   |   6 +-
 .... Setting MLPipeline Hyperparameters.ipynb |   9 +-
 ...Saving and Loading a Pipeline-Copy1.ipynb} |   9 +-
 ...ial execution and pipeline debugging.ipynb | 712 ++++++++++++++++++
 .../6. Flexible outputs specification.ipynb   | 517 +++++++++++++
 ...eline.ipynb => 7. Tuning a Pipeline.ipynb} |   0
 ...r the best pipeline with BTBSession.ipynb} |   0
 9 files changed, 1306 insertions(+), 5 deletions(-)
 create mode 100644 examples/README.md
 rename examples/tutorials/{4. Saving and Loading a Pipeline.ipynb => 4. Saving and Loading a Pipeline-Copy1.ipynb} (91%)
 create mode 100644 examples/tutorials/5. Partial execution and pipeline debugging.ipynb
 create mode 100644 examples/tutorials/6. Flexible outputs specification.ipynb
 rename examples/tutorials/{5. Tuning a Pipeline.ipynb => 7. Tuning a Pipeline.ipynb} (100%)
 rename examples/tutorials/{6. Searching for the best pipeline with BTBSession.ipynb => 8. Searching for the best pipeline with BTBSession.ipynb} (100%)

diff --git a/.gitignore b/.gitignore
index 011ff452..037d677e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,4 @@ ENV/
 .*.swp

 mlblocks/data
+examples/tutorials/pipeline.pkl
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..12131c95
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,57 @@
+# MLBlocks Examples
+
+This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks
+functionality.
+
+Within this folder you will find:
+
+* `examples.py`: Simple Python code examples of a class and a function based primitive implementation.
+* `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
+* `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
+* `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
+
+
+# Requirements
+
+In order to run the examples contained in this folder you should have [pip installed on your system
+](https://pip.pypa.io/en/stable/installing/).
+
+Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to
+run them in an isolated environment.
+
+# Usage
+
+In order to run these tutorials on your computer, please follow these steps:
+
+1. Clone this GitHub repository:
+
+```bash
+git clone git@github.com:HDI-Project/MLBlocks.git
+```
+
+2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the
+rest of your computer:
+
+```bash
+pip install virtualenv
+virtualenv -p $(which python3.6) mlblocks-venv
+source mlblocks-venv/bin/activate
+```
+
+3. Enter the repository and install the dependencies:
+
+```bash
+cd MLBlocks
+make install-examples
+```
+
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
+
+4. Enter the `examples` folder and start a Jupyter Notebook:
+
+```bash
+jupyter notebook
+```
+
+5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder.
diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
index a94c48bc..8df76259 100644
--- a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
+++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -63,7 +63,7 @@
      " 'image.classification.resnet50.xgboost']"
      ]
     },
-    "execution_count": 8,
+    "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 29f60a8f..0914e806 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -51,7 +51,14 @@
    "source": [
     "from mlblocks import MLPipeline\n",
     "\n",
-    "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
    ]
   },
   {
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
similarity index 91%
rename from examples/tutorials/4. Saving and Loading a Pipeline.ipynb
rename to examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
index 193daaf3..f8a0a5b3 100644
--- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
+++ b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
@@ -57,7 +57,14 @@
    "source": [
     "from mlblocks import MLPipeline\n",
     "\n",
-    "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
    ]
   },
   {
diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
new file mode 100644
index 00000000..2e21c85b
--- /dev/null
+++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
@@ -0,0 +1,712 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Partial execution and pipeline debugging\n",
+    "\n",
+    "In this guide we will show you how to execute a pipeline partially in order to\n",
+    "debug its internal behavior or optimize tuning processes.\n",
+    "\n",
+    "Note that some steps are not explained for simplicity. Full details\n",
+    "about them can be found in the previous parts of the tutorial.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a pipeline and a dataset\n",
+    "2. Explore the context after fitting the first primitive\n",
+    "3. Fit the rest of the pipeline\n",
+    "4. Partial execution during Predict\n",
+    "5. Rerunning the last steps"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load a pipeline and a dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a reminder, let's have a look at what the `X` and `y` variables that we will be passing to our\n",
+    "pipeline look like."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<!-- pandas HTML rendering of X_train.head() omitted; see the text/plain output below -->"
+      ],
+      "text/plain": [
+       "       age    workclass  fnlwgt   education  education-num  \\\n",
+       "28291   25      Private  193379  Assoc-acdm             12   \n",
+       "28636   55  Federal-gov  176904     HS-grad              9   \n",
+       "7919    30      Private  284395     HS-grad              9   \n",
+       "24861   17      Private  239346        10th              6   \n",
+       "23480   51      Private   57698     HS-grad              9   \n",
+       "\n",
+       "              marital-status       occupation   relationship   race  \\\n",
+       "28291          Never-married     Craft-repair  Not-in-family  White   \n",
+       "28636     Married-civ-spouse  Exec-managerial        Husband  White   \n",
+       "7919      Married-civ-spouse     Craft-repair        Husband  White   \n",
+       "24861          Never-married    Other-service      Own-child  White   \n",
+       "23480  Married-spouse-absent    Other-service      Unmarried  White   \n",
+       "\n",
+       "          sex  capital-gain  capital-loss  hours-per-week native-country  \n",
+       "28291    Male             0             0              45  United-States  \n",
+       "28636    Male             0             0              40  United-States  \n",
+       "7919     Male             0             0              50  United-States  \n",
+       "24861    Male             0             0              18  United-States  \n",
+       "23480  Female             0             0              40  United-States  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+    "above or under 50K."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_train[0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And we build a suitable pipeline for our dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlblocks import MLPipeline\n",
+    "\n",
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore the context after fitting the first primitive\n",
+    "\n",
+    "Once we know what primitives we are executing, we will execute only the first one\n",
+    "and see how the context changed after it.\n",
+    "\n",
+    "For this, we will execute the `fit` method passing the index of the last pipeline\n",
+    "step that we want to execute before returning. In this case, `0`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fit_context = pipeline.fit(X_train, y_train, output_=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**NOTE**: Optionally, instead of passing the pipeline step index, we could pass the complete name\n",
+    "of the step, including the counter number: `mlprimitives.custom.preprocessing.ClassEncoder#1`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_step = 'mlprimitives.custom.preprocessing.ClassEncoder#1'\n",
+    "fit_context = pipeline.fit(X_train, y_train, output_=output_step)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In both cases, the output will be a dictionary containing all the context variables after\n",
+    "fitting and producing the first pipeline step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['X', 'y', 'classes'])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context.keys()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notice how we find the `X` and `y` variables that we passed to the `fit` method, but also a new `classes` variable\n",
+    "that was generated by the `mlprimitives.custom.preprocessing.ClassEncoder` primitive of the first pipeline step.\n",
+    "\n",
+    "This `classes` variable contains the list of unique values that the variable `y` originally had."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' >50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context['classes']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also notice that the variable `y` has been transformed by the primitive into an array of\n",
+    "integer values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 0, 0, 0, 0])"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context['y'][0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fit the rest of the pipeline\n",
+    "\n",
+    "After exploring the context generated by the first pipeline step we will now run\n",
+    "a few more steps, up to the point where the feature matrix is ready for the XGBClassifier.\n",
+    "\n",
+    "For this, we will run the `fit` method again, passing back the context that we just obtained,\n",
+    "as well as the `start_` argument indicating that we need to start fitting on the second\n",
+    "step of the pipeline, skipping the first one, and the `output_` argument indicating that\n",
+    "we want to stop on the third step, right before the `XGBClassifier` primitive.\n",
+    "\n",
+    "Note how the context is passed using a double asterisk `**` syntax, but that individual\n",
+    "variables could also be passed as keyword arguments.\n",
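+    "\n",
+    "For example, here is a sketch (assuming the `X`, `y` and `classes` context keys shown above) of\n",
+    "the same call written with explicit keyword arguments instead of the `**` expansion:\n",
+    "\n",
+    "```python\n",
+    "fit_context = pipeline.fit(\n",
+    "    X=fit_context['X'],\n",
+    "    y=fit_context['y'],\n",
+    "    classes=fit_context['classes'],\n",
+    "    start_=1,\n",
+    "    output_=2,\n",
+    ")\n",
+    "```"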
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the context still contains the same variables as before" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['classes', 'X', 'y'])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But the variable `X` has been completely modified by the CategoricalEncoder and Imputer\n", + "primitives, so now it is a 100% numerical `numpy.ndarray` ready for the `XGBClassifier`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2.50000e+01, 1.93379e+05, 1.20000e+01, 0.00000e+00, 0.00000e+00,\n", + " 4.50000e+01, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context['X'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can pass the new context to the rest of the pipeline to finish fitting it.\n", + "\n", + "Note how, just like the `output_`, the `start_` step can also be indicated using the step\n", + "name instead of the index." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_='xgboost.XGBClassifier#1', **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Partial execution during Predict\n", + "\n", + "Just like in the `fit` stage, the `predict` method also accepts a partial output specification." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "predict_context = pipeline.predict(X_test, output_=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['X', 'y'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As well as a partial execution after a specific pipeline step" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rerunning the last steps\n", + "\n", + "One of the key advantages of the partial execution that we just explored is the\n", + "possibility to re-fit and make new predictions multiple times with different\n", + "hyperparameter values for the last half of the pipeline without the need to\n", + "re-fit and re-execute the first half.\n", + "\n", + "This has the potential to greatly accelerate tuning processes in cases where there\n", + "are no tunable hyperparameters (or there are but we do not want to tune them) in\n", + "the preprocessing steps but the execution times are long.\n", + "\n", + "As an example, let's evaluate the performance of the pipeline and try to optimize\n", + "it by changing some hyperparameters of the classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.5\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_=3, **fit_context)\n", + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.872251566146665" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. 
Flexible outputs specification.ipynb
new file mode 100644
index 00000000..3dc3686f
--- /dev/null
+++ b/examples/tutorials/6. Flexible outputs specification.ipynb
@@ -0,0 +1,517 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Flexible outputs specification\n",
+    "\n",
+    "In a previous tutorial we have learnt how to obtain intermediate pipeline\n",
+    "outputs in order to debug its internal behavior.\n",
+    "\n",
+    "In this guide we will go a bit further and learn how to define flexible outputs\n",
+    "for the pipeline in order to obtain the output of multiple primitives\n",
+    "at once.\n",
+    "\n",
+    "Note that some steps are not explained for simplicity. Full details\n",
+    "about them can be found in the previous parts of the tutorial.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a pipeline and a dataset\n",
+    "2. Explore the output specification formats"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load a pipeline and a dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset and the pipeline that we will be using."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlblocks import MLPipeline\n",
+    "\n",
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also, just as a reminder, let's have a quick look at the steps of this pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['mlprimitives.custom.preprocessing.ClassEncoder',\n",
+       " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+       " 'sklearn.impute.SimpleImputer',\n",
+       " 'xgboost.XGBClassifier',\n",
+       " 'mlprimitives.custom.preprocessing.ClassDecoder']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipeline.primitives"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And at the `X` and `y` variables that we will be passing to our pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<!-- pandas HTML rendering of X_train.head() omitted; see the text/plain output below -->"
+      ],
+      "text/plain": [
+       "       age    workclass  fnlwgt   education  education-num  \\\n",
+       "28291   25      Private  193379  Assoc-acdm             12   \n",
+       "28636   55  Federal-gov  176904     HS-grad              9   \n",
+       "7919    30      Private  284395     HS-grad              9   \n",
+       "24861   17      Private  239346        10th              6   \n",
+       "23480   51      Private   57698     HS-grad              9   \n",
+       "\n",
+       "              marital-status       occupation   relationship   race  \\\n",
+       "28291          Never-married     Craft-repair  Not-in-family  White   \n",
+       "28636     Married-civ-spouse  Exec-managerial        Husband  White   \n",
+       "7919      Married-civ-spouse     Craft-repair        Husband  White   \n",
+       "24861          Never-married    Other-service      Own-child  White   \n",
+       "23480  Married-spouse-absent    Other-service      Unmarried  White   \n",
+       "\n",
+       "          sex  capital-gain  capital-loss  hours-per-week native-country  \n",
+       "28291    Male             0             0              45  United-States  \n",
+       "28636    Male             0             0              40  United-States  \n",
+       "7919     Male             0             0              50  United-States  \n",
+       "24861    Male             0             0              18  United-States  \n",
+       "23480  Female             0             0              40  United-States  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+    "above or under 50K."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_train[0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore the output specification formats\n",
+    "\n",
+    "In the previous tutorial we learnt that the output of a pipeline can be specified\n",
+    "in multiple formats:\n",
+    "\n",
+    "* An integer indicating the pipeline step index, which will return us the complete\n",
+    "  context after producing the corresponding step.\n",
+    "* A string indicating the name of a step, which will also return us the complete\n",
+    "  context after producing the corresponding step.\n",
+    "\n",
+    "Apart from these two options, there are a few more.\n",
+    "\n",
+    "### Single variable specification\n",
+    "\n",
+    "Variables can be individually specified by passing a string in the format\n",
+    "`{pipeline-step-name}.{variable-name}`.\n",
+    "\n",
+    "Note that the `pipeline-step-name` part is not only the primitive name, but\n",
+    "also the counter number at the end of it.\n",
+    "\n",
+    "For example, if we want to explore the `classes` variable generated by\n",
+    "the `ClassEncoder` primitive during `fit`, we can do the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' >50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_spec = 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes'\n",
+    "pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**NOTE**: Just like with the full context specification, when a variable is specified\n",
+    "the pipeline will be executed only up to the step that produces the indicated variable.\n",
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List of variables\n",
+    "\n",
+    "In some cases we will be interested in obtaining more than one variable\n",
+    "at a time.\n",
+    "\n",
+    "In order to do this, instead of a single string specification we can pass\n",
+    "a list of strings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "out = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output will be a `tuple` containing the variables in the specified order."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y, classes = out"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we want to obtain variables from multiple pipeline steps, we simply need\n",
+    "to specify all of them at once. Again, **MLBlocks** will run all the necessary\n",
+    "pipeline steps, accumulating the desired variables up to the last step needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'sklearn.impute.SimpleImputer#1.X',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "X, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If required, we can even capture the same variable across the different pipeline steps!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n",
+    "    'sklearn.impute.SimpleImputer#1.X',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "X_1, X_2, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(24420, 108)"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(24420, 108)"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_2.shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
similarity index 100%
rename from examples/tutorials/5. Tuning a Pipeline.ipynb
rename to examples/tutorials/7. Tuning a Pipeline.ipynb
diff --git a/examples/tutorials/6.
Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb similarity index 100% rename from examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb rename to examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb From 03c7a2d07d15f6e69e448e72860fc4b18ad60ac9 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:30:57 -0500 Subject: [PATCH 085/160] Simplify README and make docs based on it --- README.md | 190 +++++----------------------- docs/conf.py | 4 +- docs/getting_started/install.rst | 57 --------- docs/getting_started/quickstart.rst | 125 ------------------ docs/index.rst | 54 +------- docs/readme.rst | 1 + 6 files changed, 37 insertions(+), 394 deletions(-) delete mode 100644 docs/getting_started/install.rst delete mode 100644 docs/getting_started/quickstart.rst create mode 100644 docs/readme.rst diff --git a/README.md b/README.md index 7c152fa3..f3a6e3d7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

-“MLBlocksr” +“DAI-Lab” An open source project from Data to AI Lab at MIT.

@@ -16,12 +16,12 @@ Pipelines and Primitives for Machine Learning and Data Science. [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) +# Overview + * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks * Homepage: https://github.com/HDI-Project/MLBlocks -# MLBlocks - MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. @@ -47,25 +47,10 @@ Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering with other software installed in the system where **MLBlocks** is run. -These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**: - -```bash -pip install virtualenv -virtualenv -p $(which python3.6) mlblocks-venv -``` - -Afterwards, you have to execute this command to have the virtualenv activated: - -```bash -source mlblocks-venv/bin/activate -``` - -Remember about executing it every time you start a new console to work on **MLBlocks**! - ## Install with pip -After creating the virtualenv and activating it, we recommend using -[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**: +The easiest and recommended way to install **MLBlocks** is using [pip]( +https://pip.pypa.io/en/stable/): ```bash pip install mlblocks @@ -73,46 +58,8 @@ pip install mlblocks This will pull and install the latest stable release from [PyPi](https://pypi.org/). -## Install from source - -Alternatively, with your virtualenv activated, you can clone the repository and install it from -source by running `make install` on the `stable` branch: - -```bash -git clone git@github.com:HDI-Project/MLBlocks.git -cd MLBlocks -git checkout stable -make install -``` - -## Install for Development - -If you want to contribute to the project, a few more steps are required to make the project ready -for development. - -First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks) -and make a fork of the project under you own username by clicking on the **fork** button on the -upper right corner of the page. - -Afterwards, clone your fork and create a branch from master with a descriptive name that includes -the number of the issue that you are going to work on: - -```bash -git clone git@github.com:{your username}/MLBlocks.git -cd MLBlocks -git branch issue-xx-cool-new-feature master -git checkout issue-xx-cool-new-feature -``` - -Finally, install the project with the following command, which will install some additional -dependencies for code linting and testing. - -```bash -make install-develop -``` - -Make sure to use them regularly while developing by running the commands `make lint` and `make test`. - +If you want to install from source or contribute to the project please read the +[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started). ## MLPrimitives @@ -128,118 +75,43 @@ pip install mlprimitives # Quickstart -Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it -using demo data and use it to make predictions. 
+Below there is a short example about how to use **MLBlocks** to solve a prediction problem +using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives). -Please make sure to also having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) -before following it. +```python3 +from mlblocks import MLPipeline +from mlprimitives.datasets import load_dataset -For advance usage and more detailed explanation about each component, please have a look -at the [documentation](https://HDI-Project.github.io/MLBlocks) +dataset = load_dataset('census') +X_train, X_test, y_train, y_test = dataset.get_splits(1) -## Creating a pipeline +primitives = [ + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' +] +pipeline = MLPipeline(primitives) -With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing -them to the `MLPipeline` class. +pipeline.fit(X_train, y_train) +predictions = pipeline.predict(X_test) -```python ->>> from mlblocks import MLPipeline -... primitives = [ -... 'cv2.GaussianBlur', -... 'skimage.feature.hog', -... 'sklearn.ensemble.RandomForestClassifier' -... ] ->>> pipeline = MLPipeline(primitives) -``` - -Optionally, specific initialization arguments can be also set by specifying them in a dictionary: - -```python ->>> init_params = { -... 'skimage.feature.hog': { -... 'multichannel': True, -... 'visualize': False -... }, -... 'sklearn.ensemble.RandomForestClassifier': { -... 'n_estimators': 100, -... } -... } ->>> pipeline = MLPipeline(primitives, init_params=init_params) -``` - -If you can see which hyperparameters a particular pipeline is using, you can do so by calling -its `get_hyperparameters` method: - -```python ->>> import json ->>> hyperparameters = pipeline.get_hyperparameters() ->>> print(json.dumps(hyperparameters, indent=4)) -{ - "cv2.GaussianBlur#1": { - "ksize_width": 3, - "ksize_height": 3, - "sigma_x": 0, - "sigma_y": 0 - }, - "skimage.feature.hog#1": { - "multichannel": true, - "visualize": false, - "orientations": 9, - "pixels_per_cell_x": 8, - "pixels_per_cell_y": 8, - "cells_per_block_x": 3, - "cells_per_block_y": 3, - "block_norm": null - }, - "sklearn.ensemble.RandomForestClassifier#1": { - "n_jobs": -1, - "n_estimators": 100, - "criterion": "entropy", - "max_features": null, - "max_depth": 10, - "min_samples_split": 0.1, - "min_samples_leaf": 0.1, - "class_weight": null - } -} -``` - -## Making predictions - -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. - -To do this, we first call the `fit` method passing the training data and the corresponding labels. - -In this case in particular, we will be loading the handwritten digit classification dataset -from USPS using the `mlblocks.datasets.load_usps` method, which returns a dataset object -ready to be played with. - -```python ->>> from mlblocks.datasets import load_usps ->>> dataset = load_usps() ->>> X_train, X_test, y_train, y_test = dataset.get_splits(1) ->>> pipeline.fit(X_train, y_train) -``` - -Once we have fitted our model to our data, we can call the `predict` method passing new data -to obtain predictions from the pipeline. 
- -```python ->>> predictions = pipeline.predict(X_test) ->>> predictions -array([3, 2, 1, ..., 1, 1, 2]) +dataset.score(y_test, predictions) ``` # What's Next? If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please -check our [documentation](https://HDI-Project.github.io/MLBlocks). +check our [documentation site](https://HDI-Project.github.io/MLBlocks). + +Also do not forget to have a look at the [notebook tutorials]( +https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)! -## Citing MLBlocks +# Citing MLBlocks -If you use MLBlocks, please consider citing our related papers. +If you use MLBlocks for your research, please consider citing our related papers. For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at the MIT Data To AI Lab, please see: diff --git a/docs/conf.py b/docs/conf.py index 95653914..5ff266d0 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,6 @@ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' - # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ @@ -47,7 +46,6 @@ } ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] - # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -116,7 +114,7 @@ # documentation. html_theme_options = { 'collapse_navigation': False, - 'display_version': False, + 'display_version': True, } # Add any paths that contain custom static files (such as style sheets) here, diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst deleted file mode 100644 index 4163f3bd..00000000 --- a/docs/getting_started/install.rst +++ /dev/null @@ -1,57 +0,0 @@ -.. highlight:: shell - -Installation -============ - -From PyPi ---------- - -The simplest and recommended way to install MLBlocks is using `pip`: - -.. code-block:: console - - pip install mlblocks - -If you don't have `pip`_ installed, this `Python installation guide`_ can guide -you through the process. - -.. _pip: https://pip.pypa.io -.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - -From sources ------------- - -The sources for MLBlocks can be downloaded from the `Github repo`_. - -You can either clone the public repository: - -.. code-block:: console - - git clone git://github.com/HDI-Project/MLBlocks - -Or download the `tarball`_: - -.. code-block:: console - - curl -OL https://github.com/HDI-Project/MLBlocks/tarball/master - -Once you have a copy of the source, you can install it running the next command inside the -project folder: - -.. code-block:: console - - $ make install - -.. _Github repo: https://github.com/HDI-Project/MLBlocks -.. _tarball: https://github.com/HDI-Project/MLBlocks/tarball/master - -Development ------------ - -If you are installing **MLBlocks** in order to modify its code, the installation must be done -from its sources, in the editable mode, and also including some additional dependencies in -order to be able to run the tests and build the documentation: - -.. 
code-block:: console - - make install-develop diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst deleted file mode 100644 index 31be89ee..00000000 --- a/docs/getting_started/quickstart.rst +++ /dev/null @@ -1,125 +0,0 @@ -Quickstart -========== - -Below is a short tutorial that will show you how to get started using **MLBlocks**. - -In this tutorial we will learn how to: - -* Create a pipeline using multiple primitives -* Obtain the list of tunable hyperparameters from the pipeline -* Specify hyperparameters for each primitive in the pipeline -* Fit the pipeline using training data -* Use the pipeline to make predictions from new data - -.. note:: Some additional dependencies are required in order to run this Quickstart. - Make sure that `you have already installed them`_. - -Creating a pipeline -------------------- - -With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing -them to the `MLPipeline class`_: - -.. ipython:: python - - from mlblocks import MLPipeline - primitives = [ - 'mlprimitives.custom.feature_extraction.CategoricalEncoder', - 'mlprimitives.custom.feature_extraction.StringVectorizer', - 'sklearn.ensemble.RandomForestClassifier', - ] - pipeline = MLPipeline(primitives) - -Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and -passing them as the ``init_params`` argument: - -.. ipython:: python - - init_params = { - 'sklearn.ensemble.RandomForestClassifier': { - 'n_estimators': 100 - } - } - pipeline = MLPipeline(primitives, init_params=init_params) - -Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set -for each block, by calling the `get_hyperparameters method`_. - -The output of this method is a dictionary which has the name of each block as keys and -a dictionary with the `hyperparameters`_ of the corresponding block as values. - -.. ipython:: python - - pipeline.get_hyperparameters() - -Tunable Hyperparameters ------------------------ - -One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate -the type and possible values that each primitive hyperparameter accepts. - -The list of possible hyperparameters and their details can easily be obtained from the pipeline -instance by calling its `get_tunable_hyperparameters method`_. - -The output of this method is a dictionary that contains the list of tunable hyperparameters -for each block in the pipeline, ready to be passed to any hyperparameter tuning library such -as `BTB`_. - -.. ipython:: python - - pipeline.get_tunable_hyperparameters() - -Setting Hyperparameters ------------------------ - -Modifying the hyperparameters of an already instantiated pipeline can be done using the -`set_hyperparameters method`_, which expects a dictionary with the same format as the returned -by the `get_hyperparameters method`_. - -Note that if a subset of the hyperparameters is passed, only these will be modified, and the -other ones will remain unmodified. - -.. ipython:: python - - new_hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier#1': { - 'max_depth': 15 - } - } - pipeline.set_hyperparameters(new_hyperparameters) - hyperparameters = pipeline.get_hyperparameters() - hyperparameters['sklearn.ensemble.RandomForestClassifier#1']['max_depth'] - -Making predictions ------------------- - -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. 
- -To do this, we first call the ``fit`` method passing the training data and the corresponding -labels. - -.. ipython:: python - - from mlblocks.datasets import load_personae - dataset = load_personae() - X_train, X_test, y_train, y_test = dataset.get_splits(1) - pipeline.fit(X_train, y_train) - -Once we have fitted our model to our data, we can call the ``predict`` method passing new data -to obtain predictions from the pipeline. - -.. ipython:: python - - predictions = pipeline.predict(X_test) - predictions - dataset.score(y_test, predictions) - -.. _you have already installed them: install.html#additional-dependencies -.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline -.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters -.. _hyperparameters: ../advanced_usage/hyperparameters.html -.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations -.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters -.. _BTB: https://github.com/HDI-Project/BTB -.. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters diff --git a/docs/index.rst b/docs/index.rst index c3655b3c..7a6fa800 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,53 +1,10 @@ -What is MLBlocks? -================= - -.. image:: images/mlblocks-logo.png - :width: 300 px - :alt: MLBlocks - :align: center - -MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning -tools developed in Python, whether they are custom developments or belong to third party -libraries, and build Pipelines out of them that can be fitted and then used to make predictions. - -This is achieved by providing a simple and intuitive annotation language that allows the -user to specify how to integrate with each tool, here called primitives, in order to provide -a common uniform interface to each one of them. - -At a high level: - -* Each available primitive has been annotated using a standardized JSON file that specifies its - native interface, as well as which hyperparameters can be used to tune its behavior. -* A list of primitives that will be combined into a pipeline is provided by the user, optionally - passing along the hyperparameters to use for each primitive. -* An MLBlock instance is build for each primitive, offering a common interface for all of them. -* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in - the right order, passing the output from each one as input to the next one. -* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each - MLBlock instance following the JSON annotation specification. -* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each - MLBlock sequentially to obtain the desired predictions. - -History -------- - -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: - -* `Machine learning blocks`_. - Bryan Collazo. Masters thesis, MIT EECS, 2015. - -With recent availability of a multitude of libraries and tools, we decided it was time to integrate -them and expand the library to address other data types: images, text, graph, time series and -integrate with deep learning libraries. +.. include:: readme.rst .. 
toctree:: - :caption: Getting Started - :titlesonly: + :hidden: + :maxdepth: 2 - self - getting_started/install - getting_started/quickstart + Overview .. toctree:: :caption: Advanced Usage @@ -89,6 +46,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - -.. _Machine learning blocks: https://github.com/HDI-Project/mlblocks -.. _tarball: https://github.com/HDI-Project/mlblocks/tarball/master diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 00000000..97d49585 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1 @@ +.. mdinclude:: ../README.md From 753426e5c2ec994fe8f9ca9ab928dde9380f9bf0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:34:55 -0500 Subject: [PATCH 086/160] Update devel dependencies --- .../tutorials/3. Setting MLPipeline Hyperparameters.ipynb | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 0914e806..725226f7 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 2. Setting MLPipeline Hyperparameters\n", + "# Setting MLPipeline Hyperparameters\n", "\n", "In this short guide we will see how to modify the hyperparameters\n", "of an MLPipeline in order to modify its behavior or performance.\n", @@ -429,7 +429,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 6045c574..ddb0081e 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ ], description="Pipelines and primitives for machine learning and data science.", extras_require={ - 'dev': development_requires + tests_require, + 'dev': development_requires + tests_require + examples_require, 'test': tests_require, 'examples': examples_require, }, From cd68389890109d055d05eac3ba9aefbd6e94ad1f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:36:59 -0500 Subject: [PATCH 087/160] Rename notebook --- ...eline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename examples/tutorials/{4. Saving and Loading a Pipeline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} (99%) diff --git a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb similarity index 99% rename from examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb rename to examples/tutorials/4. Saving and Loading a Pipeline.ipynb index f8a0a5b3..01a58cd5 100644 --- a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb +++ b/examples/tutorials/4. 
Saving and Loading a Pipeline.ipynb @@ -180,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, From 6e31824e61420038e9a180c0330ab2f745dbd2a2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:15:23 -0500 Subject: [PATCH 088/160] Test readme using rundoc --- Makefile | 4 ++++ README.md | 2 +- setup.py | 8 +++++--- tox.ini | 8 +++++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index bfc1a5f6..eb422682 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle test: ## run tests quickly with the default Python python -m pytest --cov=mlblocks +.PHONY: test-readme +test-readme: ## run the readme snippets + rundoc run --single-session python3 -t python3 README.md + .PHONY: test-all test-all: ## run tests on every Python version with tox tox -r diff --git a/README.md b/README.md index f3a6e3d7..3f13fec0 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv Preprint 1905.08942. 2019. -``` bibtex +```bibtex @article{smith2019mlbazaar, author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan}, title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development}, diff --git a/setup.py b/setup.py index ddb0081e..b6ba498e 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,12 @@ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', 'mlprimitives>=0.2,<0.3', - 'urllib3>=1.20,<1.25', - 'setuptools>=41.0.0', + # 'urllib3>=1.20,<1.25', + # 'setuptools>=41.0.0', 'numpy<1.17', - 'python-dateutil<2.8.1,>=2.1', + # 'python-dateutil<2.8.1,>=2.1', + 'rundoc>=0.4.3', + 'prompt-toolkit>=2.0,<3.0', ] diff --git a/tox.ini b/tox.ini index 666eeab0..1b8a777e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py35, py36, lint, docs +envlist = py35, py36, lint, docs, readme [travis] @@ -29,3 +29,9 @@ skipsdist = true extras = dev commands = /usr/bin/env make docs + + +[testenv:readme] +skipsdist = true +commands = + /usr/bin/env make test-readme From 507564de001731915692ee698aa33eda49318b75 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:22:58 -0500 Subject: [PATCH 089/160] Fix dependencies --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index b6ba498e..a4fcc7a3 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,8 @@ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', 'mlprimitives>=0.2,<0.3', - # 'urllib3>=1.20,<1.25', - # 'setuptools>=41.0.0', + 'setuptools>=41.0.0', 'numpy<1.17', - # 'python-dateutil<2.8.1,>=2.1', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', ] From 3169f7ac4911b272d5d23dd89edaa347298dfc71 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:39:53 -0500 Subject: [PATCH 090/160] Fix readme aspect in the docs --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f13fec0..793c55f5 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,16 @@ Pipelines and Primitives for Machine Learning and Data Science. 
[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) -# Overview +--- + +# MLBlocks * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks * Homepage: https://github.com/HDI-Project/MLBlocks +## Overview + MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. From dd4e7cc7a3f95792ac47f93a42f7816eb98ce1f8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:42:36 -0500 Subject: [PATCH 091/160] Fix README header --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 793c55f5..3d8a02cb 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Pipelines and Primitives for Machine Learning and Data Science. [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) ---- +
 
 # MLBlocks
 

From 9406c65f1fea6bee3441351dba93b81893a0e3f9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:45:17 -0500
Subject: [PATCH 092/160] Remove misleading point

---
 examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 12131c95..d295414e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,7 +5,7 @@
 functionality.
 
 Within this folder you will find:
 
-* `examples.py`: Simple Python code examples of a class and a function based primitive implementation.
+
 * `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
 * `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
 * `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
@@ -45,7 +45,7 @@ cd MLBlocks
 make install-examples
 ```
 
-This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives](
 https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
 
 4. Enter the `examples` folder and start a Jupyter Notebook:

From e48362685ce9355e4775490e93208e31e2c6278a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:54:24 -0500
Subject: [PATCH 093/160] Fix link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3d8a02cb..3d3e21cd 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,7 @@ the pipelines using JSON annotations or build complex multi-branched pipelines,
 check our [documentation site](https://HDI-Project.github.io/MLBlocks).
 
 Also do not forget to have a look at the [notebook tutorials](
-https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)!
+https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)!
 
 # Citing MLBlocks
 
From 60b5e425e844ee49dd1d6bd0b63e758cab0bbc6e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 11:17:00 -0500
Subject: [PATCH 094/160] Improve docs quickstart and introduction

---
 README.md       |  2 +-
 docs/index.rst  | 73 ++++++++++++++++++++++++++++++++++++++++++++++---
 docs/readme.rst |  1 -
 3 files changed, 70 insertions(+), 6 deletions(-)
 delete mode 100644 docs/readme.rst

diff --git a/README.md b/README.md
index 3d3e21cd..0f54b440 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 
 # MLBlocks
 
-* Free software: MIT license
+* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
 * Documentation: https://HDI-Project.github.io/MLBlocks
 * Homepage: https://github.com/HDI-Project/MLBlocks
 
diff --git a/docs/index.rst b/docs/index.rst
index 7a6fa800..e891230c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,10 +1,70 @@
-.. include:: readme.rst
+What is MLBlocks?
+=================
+
+.. image:: images/mlblocks-logo.png
+    :width: 300 px
+    :alt: MLBlocks
+    :align: center
+
+* Free software: `MIT license <https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE>`_
+* Documentation: https://HDI-Project.github.io/MLBlocks
+* Homepage: https://github.com/HDI-Project/MLBlocks
+
+MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
+tools developed in Python, whether they are custom developments or belong to third party
+libraries, and building Pipelines out of them that can be fitted and then used to make predictions. 
+
+This is achieved by providing a simple and intuitive annotation language that allows the
+user to specify how to integrate with each tool, here called primitives, in order to provide
+a common uniform interface to each one of them.
+
+At a high level:
+
+* Each available primitive has been annotated using a standardized JSON file that specifies its
+  native interface, as well as which hyperparameters can be used to tune its behavior.
+* A list of primitives that will be combined into a pipeline is provided by the user, optionally
+  passing along the hyperparameters to use for each primitive.
+* An MLBlock instance is built for each primitive, offering a common interface for all of them.
+* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in
+  the right order, passing the output from each one as input to the next one.
+* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each
+  MLBlock instance following the JSON annotation specification.
+* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each
+  MLBlock sequentially to obtain the desired predictions.
+
+History
+-------
+
+In its first iteration, in 2015, MLBlocks was designed only for multi-table, multi-entity temporal
+data. A good reference to see our design rationale at that time is Bryan Collazo's thesis, written
+under the supervision of Kalyan Veeramachaneni:
+
+* `Machine learning blocks`_.
+  Bryan Collazo. Master's thesis, MIT EECS, 2015.
+
+In 2018, with the recent availability of a multitude of libraries and tools, we decided it was time
+to integrate them and expand the library to address other data types, like images, text, graph or
+time series, as well as introduce the usage of deep learning libraries. A second iteration of our
+work was then started by William Xue:
+
+* `A Flexible Framework for Composing End to End Machine Learning Pipelines`_.
+  William Xue. Master's thesis, MIT EECS, 2018.
+
+Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library
+that would become part of a bigger software ecosystem designed to facilitate the development of
+robust end-to-end solutions based on Machine Learning tools. This third iteration of our work
+was presented in 2019 as part of the Machine Learning Bazaar:
+
+* `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_.
+  Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. Sigmod 2020.
 
 .. toctree::
-    :hidden:
-    :maxdepth: 2
+    :caption: Getting Started
+    :titlesonly:
 
-    Overview
+    self
+    getting_started/install
+    getting_started/quickstart
 
 .. toctree::
     :caption: Advanced Usage
@@ -46,3 +106,8 @@ Indices and tables
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
+
+.. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf
+
+.. _A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf
+.. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942
diff --git a/docs/readme.rst b/docs/readme.rst
deleted file mode 100644
index 97d49585..00000000
--- a/docs/readme.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. 
mdinclude:: ../README.md From be97f0597fbbcfceb2db7643550e4f45502b46a2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 17 Feb 2020 11:58:37 -0500 Subject: [PATCH 095/160] Add missing docs --- docs/getting_started/install.rst | 43 ++++++++++ docs/getting_started/quickstart.rst | 127 ++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 docs/getting_started/install.rst create mode 100644 docs/getting_started/quickstart.rst diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst new file mode 100644 index 00000000..d2bda921 --- /dev/null +++ b/docs/getting_started/install.rst @@ -0,0 +1,43 @@ +.. highlight:: shell + +Installation +============ + +From PyPi +--------- + +The simplest and recommended way to install MLBlocks is using `pip`: + +.. code-block:: console + + pip install mlblocks + +If you don't have `pip`_ installed, this `Python installation guide`_ can guide +you through the process. + +.. _pip: https://pip.pypa.io +.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ + +Additional dependencies +----------------------- + +In order to be usable, MLBlocks requires a compatible primitives library. + +The official library, required in order to follow the MLBlocks tutorials and documentation examples, +is `MLPrimitives`_, which you can install with this command: + +.. code-block:: console + + pip install mlprimitives + +.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives + +Install for development +----------------------- + +If you are installing **MLBlocks** in order to modify its code, the installation must be done +from its sources, in the editable mode, and also including some additional dependencies in +order to be able to run the tests and build the documentation. Instructions about this process +can be found in the `Contributing guide`_. + +.. _Contributing guide: ../contributing.html#get-started diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst new file mode 100644 index 00000000..b55223dd --- /dev/null +++ b/docs/getting_started/quickstart.rst @@ -0,0 +1,127 @@ +Quickstart +========== + +Below is a short tutorial that will show you how to get started using **MLBlocks**. + +In this tutorial we will learn how to: + +* Create a pipeline using multiple primitives +* Obtain the list of tunable hyperparameters from the pipeline +* Specify hyperparameters for each primitive in the pipeline +* Fit the pipeline using training data +* Use the pipeline to make predictions from new data + +.. note:: Some additional dependencies are required in order to run this Quickstart. + Make sure that `you have already installed them`_. + +Creating a pipeline +------------------- + +With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing +them to the `MLPipeline class`_: + +.. ipython:: python + + from mlblocks import MLPipeline + primitives = [ + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' + ] + pipeline = MLPipeline(primitives) + +Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and +passing them as the ``init_params`` argument: + +.. 
ipython:: python + + init_params = { + 'sklearn.impute.SimpleImputer': { + 'strategy': 'median' + } + } + pipeline = MLPipeline(primitives, init_params=init_params) + +Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set +for each block, by calling the `get_hyperparameters method`_. + +The output of this method is a dictionary which has the name of each block as keys and +a dictionary with the `hyperparameters`_ of the corresponding block as values. + +.. ipython:: python + + pipeline.get_hyperparameters() + +Tunable Hyperparameters +----------------------- + +One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate +the type and possible values that each primitive hyperparameter accepts. + +The list of possible hyperparameters and their details can easily be obtained from the pipeline +instance by calling its `get_tunable_hyperparameters method`_. + +The output of this method is a dictionary that contains the list of tunable hyperparameters +for each block in the pipeline, ready to be passed to any hyperparameter tuning library such +as `BTB`_. + +.. ipython:: python + + pipeline.get_tunable_hyperparameters() + +Setting Hyperparameters +----------------------- + +Modifying the hyperparameters of an already instantiated pipeline can be done using the +`set_hyperparameters method`_, which expects a dictionary with the same format as the returned +by the `get_hyperparameters method`_. + +Note that if a subset of the hyperparameters is passed, only these will be modified, and the +other ones will remain unmodified. + +.. ipython:: python + + new_hyperparameters = { + 'xgboost.XGBClassifier#1': { + 'max_depth': 15 + } + } + pipeline.set_hyperparameters(new_hyperparameters) + hyperparameters = pipeline.get_hyperparameters() + hyperparameters['xgboost.XGBClassifier#1']['max_depth'] + +Making predictions +------------------ + +Once we have created the pipeline with the desired hyperparameters we can fit it +and then use it to make predictions on new data. + +To do this, we first call the ``fit`` method passing the training data and the corresponding +labels. + +.. ipython:: python + + from mlprimitives.datasets import load_census + dataset = load_census() + X_train, X_test, y_train, y_test = dataset.get_splits(1) + pipeline.fit(X_train, y_train) + +Once we have fitted our model to our data, we can call the ``predict`` method passing new data +to obtain predictions from the pipeline. + +.. ipython:: python + + predictions = pipeline.predict(X_test) + predictions + dataset.score(y_test, predictions) + +.. _you have already installed them: install.html#additional-dependencies +.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline +.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters +.. _hyperparameters: ../advanced_usage/hyperparameters.html +.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations +.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters +.. _BTB: https://github.com/HDI-Project/BTB +.. 
_set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters

From c7194847264d2b85e183073aafd401ea8367c8ba Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 13:40:59 -0500
Subject: [PATCH 096/160] Update quickstart description

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0f54b440..6fb2f56c 100644
--- a/README.md
+++ b/README.md
@@ -79,8 +79,10 @@ pip install mlprimitives
 
 # Quickstart
 
-Below there is a short example about how to use **MLBlocks** to solve a prediction problem
-using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives).
+Below is a short example of how to use **MLBlocks** to solve the [Adult Census
+Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
+pipeline that combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives),
+[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/).
 
 ```python3
 from mlblocks import MLPipeline

From 78a47d6cda812406b48c71ba62cb7d5c34d74250 Mon Sep 17 00:00:00 2001
From: JDTheRipperPC
Date: Thu, 20 Feb 2020 12:10:45 +0100
Subject: [PATCH 097/160] Add Development status badge

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 6fb2f56c..fa4260d5 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Pipelines and Primitives for Machine Learning and Data Science.

+[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks)
 [![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks)
 [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks)
@@ -21,6 +22,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 # MLBlocks
 
 * Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
+* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 * Documentation: https://HDI-Project.github.io/MLBlocks
 * Homepage: https://github.com/HDI-Project/MLBlocks
 
From 0a9205b08c426e1e3d63fd75a0fc39c855fa176f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Sun, 23 Feb 2020 22:42:46 -0500
Subject: [PATCH 098/160] Add diagram

---
 mlblocks/mlpipeline.py | 183 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 177 insertions(+), 6 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dcfc8a0b..051e8338 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -8,6 +8,7 @@
 import warnings
 from collections import Counter, OrderedDict, defaultdict
 from copy import deepcopy
+from graphviz import Digraph
 
 import numpy as np
 
@@ -250,8 +251,7 @@ def _get_str_output(self, output):
     def get_inputs(self, fit=True):
         """Get a relation of all the input variables required by this pipeline.
 
-        The result is a dictionary that maps each variable name with their
-        specified information.
+        The result is a list that contains all of the input variables.
         Optionally include the fit arguments.
 
         Args:
@@ -259,9 +259,8 @@
             Optional argument to include fit arguments or not. Defaults to ``True``.
 
         Returns:
-            dictionary:
-                A dictionary mapping every input variable's name to a dictionary
-                specifying the information corresponding to that input variable.
+            list:
+                List of dictionaries specifying all the input variables.
                 Each dictionary contains the entry ``name``, as well
                 as any other metadata that may have been included in the
                 pipeline inputs specification.
@@ -292,7 +291,19 @@
             )
             inputs.update(fit_inputs)
 
-        return inputs
+        inputs_list=[]
+        if 'X' in inputs:
+            inputs_list.append(inputs['X'])
+            del inputs['X']
+
+        if 'y' in inputs:
+            inputs_list.append(inputs['y'])
+            del inputs['y']
+
+        for input_value in inputs.values():
+            inputs_list.append(input_value)
+
+        return inputs_list
 
     def get_outputs(self, outputs='default'):
         """Get the list of output variables that correspond to the specified outputs. 
@@ -857,6 +868,166 @@ def to_dict(self): 'outputs': self.outputs, } + def _get_simple_block_name(self, block_name): + full_name = block_name.split("#")[0] + simple_name = full_name.split(".")[-1] + return simple_name + + def _get_context_name_from_variable(self, variable_name): + block_name = variable_name.split("#")[0] + rest = variable_name[len(block_name)+1:] + block_index = rest.split(".")[0] + context_name = rest[len(block_index)+1:] + if len(context_name) == 0: + raise ValueError("Invalid variable name") + return context_name + + + def get_diagram(self, fit=True, outputs='default', image_path=None): + """ + Creates a png diagram for the pipeline, showing Pipeline Steps, + Pipeline Inputs and Outputs, and block inputs and outputs. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or a full variable specification + following the format ``{block-name}.{variable-name}``. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to `True`. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + image_path (str): + Optional argument for the location at which to save the file. + Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file. + + Returns: + None or `graphviz.Digraph` object: + * `graphviz.Digraph` contains the information about the Pipeline Diagram + """ + + diagram = Digraph(format='png') + diagram.attr('graph', splines='ortho') + diagram.attr('node', shape='box', penwidth='1') + + # Blocks + for block_name in self.blocks.keys(): + simple_name = self._get_simple_block_name(block_name) + diagram.node(block_name, simple_name) + + variables = {} + + # Inputs + inputs = self.get_inputs(fit) + input_variables = [] + with diagram.subgraph(name="cluster_inputs") as cluster: + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='24') + cluster.node('Input', 'Input', fontsize='14') + cluster.attr('edge', penwidth='0', arrowhead='none') + for input_value in inputs: + input_name = input_value['name'] + variables[input_name] = input_name+'_input' + input_variables.append(input_name) + cluster.node(variables[input_name], input_name) + cluster.edge('Input', variables[input_name]) + + with cluster.subgraph() as input_variables_subgraph: + input_variables_subgraph.attr(None, rank='same') + for index in range(1, len(input_variables)): + input_variables_subgraph.edge(variables[input_variables[index-1]], + variables[input_variables[index]]) + input_variables_subgraph.attr(None, rankdir='LR') + + # Outputs + outputs = self.get_outputs(outputs) + output_variables = [] + with diagram.subgraph(name="cluster_outputs") as cluster: + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='24') + cluster.node('Output', 'Output', fontsize='14') + cluster.attr('edge', penwidth='0', arrowhead='none') + for output in outputs: + try: + variable_name = self._get_context_name_from_variable(output['variable']) + except ValueError: + raise NotImplementedError('Can not deal with this type of output specification') + cluster.node(variable_name+'_output', variable_name) + output_variables.append(variable_name) + cluster.edge(output_variables[-1] + '_output', 'Output') + with cluster.subgraph() as output_variables_subgraph: + output_variables_subgraph.attr(None, rank='same') + for index in range(1, len(output_variables)): + 
output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output') + output_variables_subgraph.attr(None, rankdir='LR') + + cluster_edges = set() + + # Variables + diagram.attr('node', fontsize='14', penwidth='0') + diagram.attr('edge', penwidth='1') + for block_name, block in self.blocks.items(): + # Inputs + input_names = self.input_names.get(block_name, dict()) + input_variables = block.produce_args + if fit: + for input_variable in block.fit_args: + if input_variable not in input_variables: + input_variables.append(input_variable) + for input_variable in input_variables: + input_variable_name = input_variable['name'] + if input_variable_name in input_names: + diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue') + original_variable_name = input_names[input_variable_name] + diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name) + cluster_edges.add((block_name+' '+input_variable_name, block_name)) + else: + diagram.edge(variables[input_variable_name], block_name) + + # Outputs + output_names = self.output_names.get(block_name, dict()) + for output_variable in block.produce_output: + output_variable_name = output_variable['name'] + if output_variable_name in output_names: + diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red') + cluster_edges.add((block_name, block_name+' '+output_variable_name)) + new_variable_name = output_names[output_variable_name] + diagram.node(block_name+' '+new_variable_name, new_variable_name) + diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none') + variables[new_variable_name] = block_name+' '+new_variable_name + else: + diagram.node(block_name+' '+output_variable_name, output_variable_name) + diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none') + variables[output_variable_name] = block_name+' '+output_variable_name + + # Connection to output variables + for output_variable in output_variables: + variable_block = variables[output_variable] + diagram.edge(variable_block, output_variable+'_output') + + # Alignment + with diagram.subgraph() as alignment: + alignment.attr('graph', penwidth='0') + alignment.attr('edge', penwidth='0', arrowhead='none') + for index in range(1, len(self.blocks)): + alignment.edge(self._get_block_name(index-1), self._get_block_name(index)) + + # Optional names + alignment.attr('edge', len='1', minlen='1', penwidth='1') + + for first_block, second_block in cluster_edges: + with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster: + cluster.edge(first_block, second_block) + + if image_path: + diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png') + else: + return diagram + + + def save(self, path): """Save the specification of this MLPipeline in a JSON file. 
From 248ccd7b4bdb5f5e9783283ec92b3f0c78d88dff Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 17 Mar 2020 11:24:48 -0700 Subject: [PATCH 099/160] Setup update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1e8ef2ad..93b78ac3 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ install_requires = [ + 'graphviz>=0.9,<1', ] @@ -44,7 +45,6 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz>=0.9', 'ipython>=6.5.0', 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', From 0fd635c249f555217e2cf5d0fa171571a558d718 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Sat, 23 May 2020 16:54:15 -0700 Subject: [PATCH 100/160] Fix test and lint errors --- mlblocks/mlpipeline.py | 369 ++++++++++++++++++++++++++++++----------- 1 file changed, 268 insertions(+), 101 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 051e8338..128932f6 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -260,7 +260,7 @@ def get_inputs(self, fit=True): Returns: list: - List of dictionaries specifying all the input variables. + Dictionary specifying all the input variables. Each dictionary contains the entry ``name``, as well as any other metadata that may have been included in the pipeline inputs specification. @@ -291,19 +291,7 @@ def get_inputs(self, fit=True): ) inputs.update(fit_inputs) - inputs_list=[] - if 'X' in inputs: - inputs_list.append(inputs['X']) - del inputs['X'] - - if 'y' in inputs: - inputs_list.append(inputs['y']) - del inputs['y'] - - for input_value in inputs.values(): - inputs_list.append(input_value) - - return inputs_list + return inputs def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. @@ -869,67 +857,80 @@ def to_dict(self): } def _get_simple_block_name(self, block_name): + """ + Gets the most readable, simplest version of the block name, + without the number of the block or excess modifiers. + + Args: + block_name (str): + Name of the block whose simple name is being extracted. + Returns: + str: + block name stripped of number and other modifiers. + """ full_name = block_name.split("#")[0] simple_name = full_name.split(".")[-1] return simple_name def _get_context_name_from_variable(self, variable_name): + """ + Gets the name of the context from the given variable. + + Args: + variable_name (str): + Name of the variable. + Returns: + str: + Name of the context of the variable. + """ block_name = variable_name.split("#")[0] - rest = variable_name[len(block_name)+1:] + rest = variable_name[len(block_name) + 1:] block_index = rest.split(".")[0] - context_name = rest[len(block_index)+1:] + context_name = rest[len(block_index) + 1:] if len(context_name) == 0: raise ValueError("Invalid variable name") return context_name - - def get_diagram(self, fit=True, outputs='default', image_path=None): + def _make_diagram_blocks(self, diagram): """ - Creates a png diagram for the pipeline, showing Pipeline Steps, - Pipeline Inputs and Outputs, and block inputs and outputs. - - If strings are given, they can either be one of the named outputs that have - been specified on the pipeline definition or a full variable specification - following the format ``{block-name}.{variable-name}``. + Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram. Args: - fit (bool): - Optional argument to include fit arguments or not. Defaults to `True`. 
- - outputs (str, int, or list[str or int]): - Single or list of output specifications. - - image_path (str): - Optional argument for the location at which to save the file. - Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file. - - Returns: - None or `graphviz.Digraph` object: - * `graphviz.Digraph` contains the information about the Pipeline Diagram + diagram (Digraph): + Diagram to be modified. """ - - diagram = Digraph(format='png') - diagram.attr('graph', splines='ortho') diagram.attr('node', shape='box', penwidth='1') - - # Blocks for block_name in self.blocks.keys(): simple_name = self._get_simple_block_name(block_name) diagram.node(block_name, simple_name) - variables = {} + def _make_diagram_inputs(self, diagram, fit): + """ + Modifies the diagram to add the inputs of the pipeline - # Inputs - inputs = self.get_inputs(fit) + Args: + diagram (Digraph): + Diagram to be modified. + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + Returns: + dict: + Dictionary of variables mapped to their label for their node in the pipeline. + """ + diagram.attr('node', shape='box') + variables = {} input_variables = [] + inputs = self.get_inputs(fit) + with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='24') - cluster.node('Input', 'Input', fontsize='14') cluster.attr('edge', penwidth='0', arrowhead='none') - for input_value in inputs: - input_name = input_value['name'] - variables[input_name] = input_name+'_input' + cluster.node('Input', 'Input', fontsize='14') + for input_name in inputs: + variables[input_name] = input_name + '_input' input_variables.append(input_name) cluster.node(variables[input_name], input_name) cluster.edge('Input', variables[input_name]) @@ -937,97 +938,263 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') for index in range(1, len(input_variables)): - input_variables_subgraph.edge(variables[input_variables[index-1]], - variables[input_variables[index]]) + input_variables_subgraph.edge( + variables[input_variables[index - 1]], + variables[input_variables[index]]) input_variables_subgraph.attr(None, rankdir='LR') - # Outputs - outputs = self.get_outputs(outputs) + return variables + + def _make_diagram_outputs(self, diagram, outputs): + """ + Modifies the diagram to add outputs of the pipeline in order from left to right. + + Args: + diagram (Digraph): + Diagram to be modified. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. 
+ + Returns: + list[str]: + List of the human-readable names of the output variables in order + """ + diagram.attr('node', shape='box') output_variables = [] + outputs_vars = self.get_outputs(outputs) + with diagram.subgraph(name="cluster_outputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='24') - cluster.node('Output', 'Output', fontsize='14') cluster.attr('edge', penwidth='0', arrowhead='none') - for output in outputs: + cluster.node('Output', 'Output', fontsize='14') + for output in outputs_vars: try: variable_name = self._get_context_name_from_variable(output['variable']) except ValueError: - raise NotImplementedError('Can not deal with this type of output specification') - cluster.node(variable_name+'_output', variable_name) + raise NotImplementedError( + 'Can not deal with this type of output specification') + cluster.node(variable_name + '_output', variable_name) output_variables.append(variable_name) cluster.edge(output_variables[-1] + '_output', 'Output') with cluster.subgraph() as output_variables_subgraph: output_variables_subgraph.attr(None, rank='same') for index in range(1, len(output_variables)): - output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output') + output_variables_subgraph.edge(output_variables[index - 1] + '_output', + output_variables[index] + '_output') output_variables_subgraph.attr(None, rankdir='LR') - cluster_edges = set() + return output_variables - # Variables + def _make_diagram_variables(self, diagram, fit, variables): + """ + Modifies the diagram to add main variables of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ diagram.attr('node', fontsize='14', penwidth='0') diagram.attr('edge', penwidth='1') + cluster_edges = set() + for block_name, block in self.blocks.items(): - # Inputs - input_names = self.input_names.get(block_name, dict()) - input_variables = block.produce_args - if fit: - for input_variable in block.fit_args: - if input_variable not in input_variables: - input_variables.append(input_variable) - for input_variable in input_variables: - input_variable_name = input_variable['name'] - if input_variable_name in input_names: - diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue') - original_variable_name = input_names[input_variable_name] - diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name) - cluster_edges.add((block_name+' '+input_variable_name, block_name)) - else: - diagram.edge(variables[input_variable_name], block_name) - - # Outputs - output_names = self.output_names.get(block_name, dict()) - for output_variable in block.produce_output: - output_variable_name = output_variable['name'] - if output_variable_name in output_names: - diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red') - cluster_edges.add((block_name, block_name+' '+output_variable_name)) - new_variable_name = output_names[output_variable_name] - diagram.node(block_name+' '+new_variable_name, new_variable_name) - diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none') - variables[new_variable_name] = block_name+' '+new_variable_name - else: - diagram.node(block_name+' '+output_variable_name, output_variable_name) - diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none') - variables[output_variable_name] = block_name+' '+output_variable_name + self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block, + block_name) + self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block, + block_name) + return cluster_edges + + def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block, + block_name): + """ + Modifies the diagram to add input variables the corresponding block of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + cluster_edges (set): + Set of tuples that may contain some alternative variable names and its + corresponding block in order + + block (MLBlock): + The block to add its input variables to the diagram + + block_name (str): + The name of the block to add its input variables to the diagram + + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ + input_names = self.input_names.get(block_name, dict()) + input_variables = block.produce_args + + if fit: + for input_variable in block.fit_args: + if input_variable not in input_variables: + input_variables.append(input_variable) + for input_variable in input_variables: + input_variable_name = input_variable['name'] + if input_variable_name in input_names: + diagram.node(block_name + ' ' + input_variable_name, + '(' + input_variable_name + ')', fontcolor='blue') + original_variable_name = input_names[input_variable_name] + diagram.edge(variables[original_variable_name], + block_name + ' ' + input_variable_name) + cluster_edges.add((block_name + ' ' + input_variable_name, block_name)) + else: + diagram.edge(variables[input_variable_name], block_name) + + def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, + block_name): + """ + Modifies the diagram to add output variables the corresponding block of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. + + cluster_edges (set): + Set of tuples that may contain some alternative variable names and its + corresponding block in order + + block (MLBlock): + The block to add its output variables to the diagram + + block_name (str): + The name of the block to add its output variables to the diagram + + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ + output_names = self.output_names.get(block_name, dict()) + for output_variable in block.produce_output: + output_variable_name = output_variable['name'] + if output_variable_name in output_names: + diagram.node(block_name + ' ' + output_variable_name, + '(' + output_variable_name + ')', fontcolor='red') + cluster_edges.add((block_name, block_name + ' ' + output_variable_name)) + new_variable_name = output_names[output_variable_name] + diagram.node(block_name + ' ' + new_variable_name, new_variable_name) + diagram.edge(block_name + ' ' + output_variable_name, + block_name + ' ' + new_variable_name, arrowhead='none') + variables[new_variable_name] = block_name + ' ' + new_variable_name + else: + diagram.node(block_name + ' ' + output_variable_name, output_variable_name) + diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none') + variables[output_variable_name] = block_name + ' ' + output_variable_name + + def _make_diagram_output_connections(self, diagram, variables, output_variables): + """ + Modifies the diagram to add connections to the output variables of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified - # Connection to output variables + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + output_variables (list[str]): + List of the human-readable names of the output variables in order + """ for output_variable in output_variables: variable_block = variables[output_variable] - diagram.edge(variable_block, output_variable+'_output') + diagram.edge(variable_block, output_variable + '_output') + + def _make_diagram_alignment(self, diagram, cluster_edges): + """ + Modifies the diagram to add alignment edges and connect alternative names to the blocks. + + Args: + diagram (Digraph): + Diagram to be modified - # Alignment + cluster_edges (set): + Set of tuples that contain alternative variable names and its + corresponding block in order + """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') alignment.attr('edge', penwidth='0', arrowhead='none') for index in range(1, len(self.blocks)): - alignment.edge(self._get_block_name(index-1), self._get_block_name(index)) + alignment.edge(self._get_block_name(index - 1), self._get_block_name(index)) - # Optional names alignment.attr('edge', len='1', minlen='1', penwidth='1') - for first_block, second_block in cluster_edges: - with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster: + with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: cluster.edge(first_block, second_block) + def get_diagram(self, fit=True, outputs='default', image_path=None): + """ + Creates a png diagram for the pipeline, showing Pipeline Steps, + Pipeline Inputs and Outputs, and block inputs and outputs. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or a full variable specification + following the format ``{block-name}.{variable-name}``. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to `True`. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + image_path (str): + Optional argument for the location at which to save the file. + Defaults to `None`, which returns a `graphviz.Digraph` object instead of + saving the file. + + Returns: + None or `graphviz.Digraph` object: + * `graphviz.Digraph` contains the information about the Pipeline Diagram + """ + + diagram = Digraph(format='png') + diagram.attr('graph', splines='ortho') + + self._make_diagram_blocks(diagram) + variables = self._make_diagram_inputs(diagram, fit) + output_variables = self._make_diagram_outputs(diagram, outputs) + cluster_edges = self._make_diagram_variables(diagram, fit, variables) + self._make_diagram_output_connections(diagram, variables, output_variables) + self._make_diagram_alignment(diagram, cluster_edges) + if image_path: diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png') else: return diagram - - def save(self, path): """Save the specification of this MLPipeline in a JSON file. 
From d47e339496b72fb13472fa68e4044f978a9cf0a4 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 10:15:32 -0700 Subject: [PATCH 101/160] Fix import order --- mlblocks/mlpipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 128932f6..a96995be 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -8,9 +8,9 @@ import warnings from collections import Counter, OrderedDict, defaultdict from copy import deepcopy -from graphviz import Digraph import numpy as np +from graphviz import Digraph from mlblocks.discovery import load_pipeline from mlblocks.mlblock import MLBlock From 62c310e72b10779544d993bb172be708e8095ccd Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 11:36:11 -0700 Subject: [PATCH 102/160] Fix double arrow bug --- mlblocks/mlpipeline.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index a96995be..48d68268 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -864,6 +864,7 @@ def _get_simple_block_name(self, block_name): Args: block_name (str): Name of the block whose simple name is being extracted. + Returns: str: block name stripped of number and other modifiers. @@ -879,6 +880,7 @@ def _get_context_name_from_variable(self, variable_name): Args: variable_name (str): Name of the variable. + Returns: str: Name of the context of the variable. @@ -926,7 +928,7 @@ def _make_diagram_inputs(self, diagram, fit): with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') - cluster.attr('node', penwidth='0', fontsize='24') + cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Input', 'Input', fontsize='14') for input_name in inputs: @@ -966,7 +968,7 @@ def _make_diagram_outputs(self, diagram, outputs): with diagram.subgraph(name="cluster_outputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') - cluster.attr('node', penwidth='0', fontsize='24') + cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Output', 'Output', fontsize='14') for output in outputs_vars: @@ -1048,23 +1050,23 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e in order """ input_names = self.input_names.get(block_name, dict()) - input_variables = block.produce_args + input_variables = set(variable['name'] for variable in block.produce_args) if fit: for input_variable in block.fit_args: - if input_variable not in input_variables: - input_variables.append(input_variable) + if input_variable['name'] not in input_variables: + input_variables.add(input_variable['name']) + for input_variable in input_variables: - input_variable_name = input_variable['name'] - if input_variable_name in input_names: - diagram.node(block_name + ' ' + input_variable_name, - '(' + input_variable_name + ')', fontcolor='blue') - original_variable_name = input_names[input_variable_name] + if input_variable in input_names: + diagram.node(block_name + ' ' + input_variable, + '(' + input_variable + ')', fontcolor='blue') + original_variable_name = input_names[input_variable] diagram.edge(variables[original_variable_name], - block_name + ' ' + input_variable_name) - cluster_edges.add((block_name + ' ' + input_variable_name, block_name)) + block_name + ' ' + 
input_variable) + cluster_edges.add((block_name + ' ' + input_variable, block_name)) else: - diagram.edge(variables[input_variable_name], block_name) + diagram.edge(variables[input_variable], block_name) def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, block_name): @@ -1145,10 +1147,6 @@ def _make_diagram_alignment(self, diagram, cluster_edges): """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') - alignment.attr('edge', penwidth='0', arrowhead='none') - for index in range(1, len(self.blocks)): - alignment.edge(self._get_block_name(index - 1), self._get_block_name(index)) - alignment.attr('edge', len='1', minlen='1', penwidth='1') for first_block, second_block in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: From a3394b55aa505771a229676991541ea66cc4c226 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 17:06:01 -0700 Subject: [PATCH 103/160] Edit tooltips --- mlblocks/mlpipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 48d68268..81c3fc19 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -927,10 +927,11 @@ def _make_diagram_inputs(self, diagram, fit): inputs = self.get_inputs(fit) with diagram.subgraph(name="cluster_inputs") as cluster: + cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') - cluster.node('Input', 'Input', fontsize='14') + cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables') for input_name in inputs: variables[input_name] = input_name + '_input' input_variables.append(input_name) @@ -967,10 +968,11 @@ def _make_diagram_outputs(self, diagram, outputs): outputs_vars = self.get_outputs(outputs) with diagram.subgraph(name="cluster_outputs") as cluster: + cluster.attr(tooltip='Output variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') - cluster.node('Output', 'Output', fontsize='14') + cluster.node('Output', 'Output', fontsize='14', tooltip='Output variables') for output in outputs_vars: try: variable_name = self._get_context_name_from_variable(output['variable']) @@ -1180,6 +1182,8 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram = Digraph(format='png') diagram.attr('graph', splines='ortho') + diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges + diagram.attr('edge', tooltip=' ') self._make_diagram_blocks(diagram) variables = self._make_diagram_inputs(diagram, fit) From aabab78e56b60216330ec460b8c3a3dcc040aa30 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Wed, 27 May 2020 14:08:46 -0700 Subject: [PATCH 104/160] Fix bug with repetitive variable node names --- mlblocks/mlpipeline.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 81c3fc19..af00e34b 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -1061,12 +1061,13 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e for input_variable in input_variables: if input_variable in input_names: - diagram.node(block_name + ' ' + input_variable, + input_variable_label = block_name + ' ' + 
input_variable + ' (input)' + diagram.node(input_variable_label, '(' + input_variable + ')', fontcolor='blue') original_variable_name = input_names[input_variable] diagram.edge(variables[original_variable_name], - block_name + ' ' + input_variable) - cluster_edges.add((block_name + ' ' + input_variable, block_name)) + input_variable_label) + cluster_edges.add((input_variable_label, block_name)) else: diagram.edge(variables[input_variable], block_name) @@ -1104,18 +1105,20 @@ def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges for output_variable in block.produce_output: output_variable_name = output_variable['name'] if output_variable_name in output_names: - diagram.node(block_name + ' ' + output_variable_name, + output_variable_label = block_name + ' ' + output_variable_name + ' (output)' + diagram.node(output_variable_label, '(' + output_variable_name + ')', fontcolor='red') - cluster_edges.add((block_name, block_name + ' ' + output_variable_name)) + cluster_edges.add((block_name, output_variable_label)) new_variable_name = output_names[output_variable_name] diagram.node(block_name + ' ' + new_variable_name, new_variable_name) - diagram.edge(block_name + ' ' + output_variable_name, + diagram.edge(output_variable_label, block_name + ' ' + new_variable_name, arrowhead='none') variables[new_variable_name] = block_name + ' ' + new_variable_name else: - diagram.node(block_name + ' ' + output_variable_name, output_variable_name) - diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none') - variables[output_variable_name] = block_name + ' ' + output_variable_name + output_variable_label = block_name + ' ' + output_variable_name + diagram.node(output_variable_label, output_variable_name) + diagram.edge(block_name, output_variable_label, arrowhead='none') + variables[output_variable_name] = output_variable_label def _make_diagram_output_connections(self, diagram, variables, output_variables): """ From ca2c973583942a1dbab72ec96f1013e4069d6995 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Wed, 27 May 2020 22:27:44 -0700 Subject: [PATCH 105/160] Remove unnecessary nodes and edges from diagram --- mlblocks/mlpipeline.py | 339 ++++++++++++++++++++--------------------- 1 file changed, 162 insertions(+), 177 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index af00e34b..6d2738ba 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -893,22 +893,57 @@ def _get_context_name_from_variable(self, variable_name): raise ValueError("Invalid variable name") return context_name - def _make_diagram_blocks(self, diagram): + def _get_relevant_output_variables(self, block_name, block, current_output_variables): """ - Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram. 
+ Gets the output variables of the given block that are in a given set of output variables + + Args: + block_name (str): + The name of the block from which the variables are outputted + block (MLBlock): + The block from which the variables are outputted + + current_output_variables (list): + A list of possible output variables to return + + Returns: + set: + A set of strings containing the output variable name if and only if it is an + output variable of the given block and its name is in the list of possible + output variables + """ + output_alt_names = self.output_names.get(block_name, dict()) + relevant_output = set() + for block_output in block.produce_output: + output_variable_name = block_output['name'] + if output_variable_name in output_alt_names.keys(): + output_variable_name = output_alt_names[output_variable_name] + + if output_variable_name in current_output_variables: + relevant_output.add(block_output['name']) + + return relevant_output + + def _make_diagram_block(self, diagram, block_name): + """ + Modifies the diagram to add the corresponding block of the pipeline as a visible node in + the diagram. Args: diagram (Digraph): Diagram to be modified. + + block_name (str): + Name of block to be added to the diagram """ - diagram.attr('node', shape='box', penwidth='1') - for block_name in self.blocks.keys(): - simple_name = self._get_simple_block_name(block_name) - diagram.node(block_name, simple_name) + simple_name = self._get_simple_block_name(block_name) + diagram.node(block_name, simple_name, penwidth='1') - def _make_diagram_inputs(self, diagram, fit): + def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, variable_blocks): """ - Modifies the diagram to add the inputs of the pipeline + Modifies the diagram to add the corresponding input variables to the corresponding block + and their edges as outputs to other blocks by modifying `variable_blocks`. Additionally + modifies a set of edges to add any edges between an alternative input name and this block. Args: diagram (Digraph): @@ -917,37 +952,120 @@ def _make_diagram_inputs(self, diagram, fit): fit (bool): `True` if including fitted arguments, `False` otherwise. - Returns: - dict: - Dictionary of variables mapped to their label for their node in the pipeline. 
+ block_name (str): + Name of block whose input variables are to be added to the diagram + + block (MLBlock): + Block whose input variables are to be added to the diagram + + cluster_edges (set): + Set of edges between alternative variable names and their corresponding block + + variable_blocks (dict): + Dictionary of variable names and the set of blocks into which the variable connects """ - diagram.attr('node', shape='box') - variables = {} - input_variables = [] - inputs = self.get_inputs(fit) + input_alt_names = self.input_names.get(block_name, dict()) + input_variables = set(variable['name'] for variable in block.produce_args) + + if fit: + for input_variable in block.fit_args: + if input_variable['name'] not in input_variables: + input_variables.add(input_variable['name']) + for input_name in input_variables: + input_block = block_name + if input_name in input_alt_names: + input_variable_label = block_name + ' ' + input_name + ' (input)' + diagram.node(input_variable_label, + '(' + input_name + ')', fontcolor='blue') + cluster_edges.add((input_variable_label, block_name)) + input_name = input_alt_names[input_name] + input_block = input_variable_label + + if input_name in variable_blocks.keys(): + variable_blocks[input_name].add(input_block) + else: + variable_blocks[input_name] = {input_block} + + def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, + variable_blocks): + """ + Modifies the diagram to add the corresponding output variables to the corresponding block + and their edges as inputs to other blocks, as well as updating `variable_blocks`. + Additionally modifies a set of edges to add any edges between an alternative output name + and this block. + + Args: + diagram (Digraph): + Diagram to be modified. + + block_name (str): + Name of block whose output variables are to be added to the diagram + + output_names (set): + Set of output variable names to be added to the diagram + + cluster_edges (set): + Set of edges between alternative variable names and their corresponding block + + variable_blocks (dict): + Dictionary of variable names and the set of blocks into which the variable connects + """ + output_alt_names = self.output_names.get(block_name, dict()) + for output_name in output_names: + output_block = block_name + if output_name in output_alt_names.keys(): + alt_variable_label = block_name + ' ' + output_name + ' (output)' + diagram.node(alt_variable_label, + '(' + output_name + ')', fontcolor='red') + cluster_edges.add((block_name, alt_variable_label)) + output_name = output_alt_names[output_name] + output_block = alt_variable_label + + output_variable_label = block_name + ' ' + output_name + diagram.node(output_variable_label, output_name) + diagram.edge(output_block, output_variable_label, arrowhead='none') + + for block in variable_blocks[output_name]: + diagram.edge(output_variable_label, block) + + del variable_blocks[output_name] + + def _make_diagram_inputs(self, diagram, input_variables_blocks): + """ + Modifies the diagram to add the inputs of the pipeline + + Args: + diagram (Digraph): + Diagram to be modified. 
+ + input_variables_blocks (dict): + Dictionary of input variables of the pipeline and the set of blocks where the + corresponding variable is an input + """ with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables') - for input_name in inputs: - variables[input_name] = input_name + '_input' - input_variables.append(input_name) - cluster.node(variables[input_name], input_name) - cluster.edge('Input', variables[input_name]) + input_variables = [] + for input_name, blocks in input_variables_blocks.items(): + input_name_label = input_name + '_input' + cluster.node(input_name_label, input_name) + cluster.edge('Input', input_name_label) + input_variables.append(input_name_label) + + for block in blocks: + diagram.edge(input_name_label, block, pendwith='1') with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') for index in range(1, len(input_variables)): - input_variables_subgraph.edge( - variables[input_variables[index - 1]], - variables[input_variables[index]]) + input_variables_subgraph.edge(input_variables[index - 1], + input_variables[index]) input_variables_subgraph.attr(None, rankdir='LR') - return variables - def _make_diagram_outputs(self, diagram, outputs): """ Modifies the diagram to add outputs of the pipeline in order from left to right. @@ -963,7 +1081,6 @@ def _make_diagram_outputs(self, diagram, outputs): list[str]: List of the human-readable names of the output variables in order """ - diagram.attr('node', shape='box') output_variables = [] outputs_vars = self.get_outputs(outputs) @@ -991,153 +1108,6 @@ def _make_diagram_outputs(self, diagram, outputs): return output_variables - def _make_diagram_variables(self, diagram, fit, variables): - """ - Modifies the diagram to add main variables of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - diagram.attr('node', fontsize='14', penwidth='0') - diagram.attr('edge', penwidth='1') - cluster_edges = set() - - for block_name, block in self.blocks.items(): - self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block, - block_name) - self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block, - block_name) - return cluster_edges - - def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block, - block_name): - """ - Modifies the diagram to add input variables the corresponding block of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. 
- - cluster_edges (set): - Set of tuples that may contain some alternative variable names and its - corresponding block in order - - block (MLBlock): - The block to add its input variables to the diagram - - block_name (str): - The name of the block to add its input variables to the diagram - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - input_names = self.input_names.get(block_name, dict()) - input_variables = set(variable['name'] for variable in block.produce_args) - - if fit: - for input_variable in block.fit_args: - if input_variable['name'] not in input_variables: - input_variables.add(input_variable['name']) - - for input_variable in input_variables: - if input_variable in input_names: - input_variable_label = block_name + ' ' + input_variable + ' (input)' - diagram.node(input_variable_label, - '(' + input_variable + ')', fontcolor='blue') - original_variable_name = input_names[input_variable] - diagram.edge(variables[original_variable_name], - input_variable_label) - cluster_edges.add((input_variable_label, block_name)) - else: - diagram.edge(variables[input_variable], block_name) - - def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, - block_name): - """ - Modifies the diagram to add output variables the corresponding block of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. - - cluster_edges (set): - Set of tuples that may contain some alternative variable names and its - corresponding block in order - - block (MLBlock): - The block to add its output variables to the diagram - - block_name (str): - The name of the block to add its output variables to the diagram - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - output_names = self.output_names.get(block_name, dict()) - for output_variable in block.produce_output: - output_variable_name = output_variable['name'] - if output_variable_name in output_names: - output_variable_label = block_name + ' ' + output_variable_name + ' (output)' - diagram.node(output_variable_label, - '(' + output_variable_name + ')', fontcolor='red') - cluster_edges.add((block_name, output_variable_label)) - new_variable_name = output_names[output_variable_name] - diagram.node(block_name + ' ' + new_variable_name, new_variable_name) - diagram.edge(output_variable_label, - block_name + ' ' + new_variable_name, arrowhead='none') - variables[new_variable_name] = block_name + ' ' + new_variable_name - else: - output_variable_label = block_name + ' ' + output_variable_name - diagram.node(output_variable_label, output_variable_name) - diagram.edge(block_name, output_variable_label, arrowhead='none') - variables[output_variable_name] = output_variable_label - - def _make_diagram_output_connections(self, diagram, variables, output_variables): - """ - Modifies the diagram to add connections to the output variables of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. 
- - output_variables (list[str]): - List of the human-readable names of the output variables in order - """ - for output_variable in output_variables: - variable_block = variables[output_variable] - diagram.edge(variable_block, output_variable + '_output') - def _make_diagram_alignment(self, diagram, cluster_edges): """ Modifies the diagram to add alignment edges and connect alternative names to the blocks. @@ -1152,7 +1122,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges): """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') + alignment.attr('node', penwidth='0') alignment.attr('edge', len='1', minlen='1', penwidth='1') + for first_block, second_block in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: cluster.edge(first_block, second_block) @@ -1187,12 +1159,25 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram.attr('graph', splines='ortho') diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges diagram.attr('edge', tooltip=' ') + diagram.attr('node', shape='box', penwidth='0') - self._make_diagram_blocks(diagram) - variables = self._make_diagram_inputs(diagram, fit) output_variables = self._make_diagram_outputs(diagram, outputs) - cluster_edges = self._make_diagram_variables(diagram, fit, variables) - self._make_diagram_output_connections(diagram, variables, output_variables) + + cluster_edges = set() + variable_blocks = dict((name, {name + '_output'}) for name in output_variables) + for block_name, block in reversed(self.blocks.items()): + relevant_output_names = self._get_relevant_output_variables(block_name, block, + variable_blocks.keys()) + if len(relevant_output_names) == 0: + continue # skip this block + + self._make_diagram_block(diagram, block_name) + self._make_block_outputs(diagram, block_name, relevant_output_names, cluster_edges, + variable_blocks) + self._make_block_inputs(diagram, fit, block_name, block, cluster_edges, + variable_blocks) + + self._make_diagram_inputs(diagram, variable_blocks) self._make_diagram_alignment(diagram, cluster_edges) if image_path: From 8deb6d64324842656f98968f79ae13d0e7c3c8b9 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 4 Jun 2020 16:56:26 -0700 Subject: [PATCH 106/160] Remove intermediate arrowheads --- mlblocks/mlpipeline.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 6d2738ba..2465ea5f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -900,6 +900,7 @@ def _get_relevant_output_variables(self, block_name, block, current_output_varia Args: block_name (str): The name of the block from which the variables are outputted + block (MLBlock): The block from which the variables are outputted @@ -959,10 +960,12 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var Block whose input variables are to be added to the diagram cluster_edges (set): - Set of edges between alternative variable names and their corresponding block + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead variable_blocks (dict): - Dictionary of variable names and the set of blocks into which the variable connects + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use """ input_alt_names = self.input_names.get(block_name, dict()) 
input_variables = set(variable['name'] for variable in block.produce_args) @@ -974,18 +977,20 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var for input_name in input_variables: input_block = block_name + arrowhead = 'normal' if input_name in input_alt_names: input_variable_label = block_name + ' ' + input_name + ' (input)' diagram.node(input_variable_label, '(' + input_name + ')', fontcolor='blue') - cluster_edges.add((input_variable_label, block_name)) + cluster_edges.add((input_variable_label, block_name, 'normal')) input_name = input_alt_names[input_name] input_block = input_variable_label + arrowhead = 'none' if input_name in variable_blocks.keys(): - variable_blocks[input_name].add(input_block) + variable_blocks[input_name].add((input_block, arrowhead)) else: - variable_blocks[input_name] = {input_block} + variable_blocks[input_name] = {(input_block, arrowhead)} def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, variable_blocks): @@ -1006,10 +1011,12 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, Set of output variable names to be added to the diagram cluster_edges (set): - Set of edges between alternative variable names and their corresponding block + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead variable_blocks (dict): - Dictionary of variable names and the set of blocks into which the variable connects + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use """ output_alt_names = self.output_names.get(block_name, dict()) for output_name in output_names: @@ -1018,7 +1025,7 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, alt_variable_label = block_name + ' ' + output_name + ' (output)' diagram.node(alt_variable_label, '(' + output_name + ')', fontcolor='red') - cluster_edges.add((block_name, alt_variable_label)) + cluster_edges.add((block_name, alt_variable_label, 'none')) output_name = output_alt_names[output_name] output_block = alt_variable_label @@ -1026,8 +1033,8 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, diagram.node(output_variable_label, output_name) diagram.edge(output_block, output_variable_label, arrowhead='none') - for block in variable_blocks[output_name]: - diagram.edge(output_variable_label, block) + for block, arrow in variable_blocks[output_name]: + diagram.edge(output_variable_label, block, arrowhead=arrow) del variable_blocks[output_name] @@ -1040,8 +1047,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): Diagram to be modified. 
input_variables_blocks (dict): - Dictionary of input variables of the pipeline and the set of blocks where the - corresponding variable is an input + Dictionary of input variables of the pipeline and the set of tuples of blocks into + which the variable connects and the type of arrowhead to use """ with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr(tooltip='Input variables') @@ -1056,8 +1063,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): cluster.edge('Input', input_name_label) input_variables.append(input_name_label) - for block in blocks: - diagram.edge(input_name_label, block, pendwith='1') + for block, arrow in blocks: + diagram.edge(input_name_label, block, pendwith='1', arrowhead=arrow) with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') @@ -1125,9 +1132,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges): alignment.attr('node', penwidth='0') alignment.attr('edge', len='1', minlen='1', penwidth='1') - for first_block, second_block in cluster_edges: + for first_block, second_block, arrow in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: - cluster.edge(first_block, second_block) + cluster.edge(first_block, second_block, arrowhead=arrow) def get_diagram(self, fit=True, outputs='default', image_path=None): """ @@ -1158,13 +1165,12 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram = Digraph(format='png') diagram.attr('graph', splines='ortho') diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges - diagram.attr('edge', tooltip=' ') diagram.attr('node', shape='box', penwidth='0') output_variables = self._make_diagram_outputs(diagram, outputs) cluster_edges = set() - variable_blocks = dict((name, {name + '_output'}) for name in output_variables) + variable_blocks = dict((name, {(name + '_output', 'normal')}) for name in output_variables) for block_name, block in reversed(self.blocks.items()): relevant_output_names = self._get_relevant_output_variables(block_name, block, variable_blocks.keys()) From ea8bb9a25e12ee13b29f985874ef15f1f032e690 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 4 Jun 2020 17:55:04 -0700 Subject: [PATCH 107/160] Add diagram tests --- tests/data/diagrams/diagram_fit.txt | 40 +++++++++ .../data/diagrams/diagram_multiple_blocks.txt | 44 +++++++++ tests/data/diagrams/diagram_simple.txt | 40 +++++++++ tests/test_mlpipeline.py | 90 +++++++++++++++++++ 4 files changed, 214 insertions(+) create mode 100644 tests/data/diagrams/diagram_fit.txt create mode 100644 tests/data/diagrams/diagram_multiple_blocks.txt create mode 100644 tests/data/diagrams/diagram_simple.txt diff --git a/tests/data/diagrams/diagram_fit.txt b/tests/data/diagrams/diagram_fit.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_fit.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + 
"a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_multiple_blocks.txt b/tests/data/diagrams/diagram_multiple_blocks.txt new file mode 100644 index 00000000..3f43a108 --- /dev/null +++ b/tests/data/diagrams/diagram_multiple_blocks.txt @@ -0,0 +1,44 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_b_output [label=output_variable_b] + output_variable_b_output -> Output + { + rank=same + rankdir=LR + } + } + "b_primitive#1" [label=b_primitive penwidth=1] + "b_primitive#1 output_variable_b" [label=output_variable_b] + "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none] + "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal] + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable_a" [label=output_variable_a] + "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none] + "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_simple.txt b/tests/data/diagrams/diagram_simple.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_simple.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> 
input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 340a3838..9d649ad1 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,96 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_simple(self): + f = open('tests/data/diagrams/diagram_simple.txt', 'r') + expected = f.read()[:-1] + f.close() + + output = [ + { + 'name': 'output_variable', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable' + } + ] + + pipeline = MLPipeline(['a_primitive'], outputs={'default': output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = output + + assert str(pipeline.get_diagram()) == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_fit(self): + f = open('tests/data/diagrams/diagram_fit.txt', 'r') + expected = f.read()[:-1] + f.close() + + output = [ + { + 'name': 'output_variable', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable' + } + ] + + pipeline = MLPipeline(['a_primitive'], outputs={'default': output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = output + + assert str(pipeline.get_diagram()) == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_multiple_blocks(self): + f = open('tests/data/diagrams/diagram_multiple_blocks.txt', 'r') + expected = f.read()[:-1] + f.close() + + first_output = [ + { + 'name': 'output_variable_a', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable_a' + } + ] + second_output = [ + { + 'name': 'output_variable_b', + 'type': 'another_whatever', + 'variable': 'b_primitive#1.output_variable_b' + } + ] + + pipeline = MLPipeline(['a_primitive', 'b_primitive'], outputs={'default': second_output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = first_output + pipeline.blocks['b_primitive#1'].produce_args = first_output + pipeline.blocks['b_primitive#1'].produce_output = second_output + + assert str(pipeline.get_diagram()) == expected + def test_fit(self): pass From 73865035c6fac86321ea86368d515d4fed068dba Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 15:52:08 +0300 Subject: [PATCH 108/160] added dictionary to record block execution time --- mlblocks/mlpipeline.py | 17 +++++++++++++++++ setup.py | 1 + 2 files changed, 18 insertions(+) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index dcfc8a0b..6fc789d4 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -8,6 +8,7 @@ import warnings from collections import Counter, OrderedDict, defaultdict from copy import deepcopy +from datetime import datetime import numpy as np @@ -223,6 +224,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.set_hyperparameters(hyperparameters) self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') + self.time = dict() def _get_str_output(self, 
output): """Get the outputs that correspond to the str specification.""" @@ -390,6 +392,18 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] + def get_time(self): + """Get the execution time of each block. + + If called before fitting the pipeline, it will return an empty dictionary. + + Returns: + dict: + A dictionary containing the block names as keys and + the execution time in seconds as values. + """ + return self.time.copy() + def _extract_block_name(self, variable_name): return self._re_block_name.search(variable_name).group(1) @@ -616,7 +630,10 @@ def _fit_block(self, block, block_name, context): LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) + start = datetime.utcnow() block.fit(**fit_args) + elapsed = datetime.utcnow() - start + self.time[block_name] = elapsed.total_seconds() except Exception: if self.verbose: LOGGER.exception("Exception caught fitting MLBlock %s", block_name) diff --git a/setup.py b/setup.py index a4fcc7a3..56ab70cd 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ install_requires = [ + 'Keras>=2.1.6,<2.4' ] From d35544ed72850f6ed4224f16e1344039b1bfb2f1 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 22:39:37 +0300 Subject: [PATCH 109/160] add debug argument for fit/predict --- mlblocks/mlpipeline.py | 90 ++++++++++++++++++++++++++++++------------ setup.py | 1 - 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 6fc789d4..8e5d0629 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -224,7 +224,6 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.set_hyperparameters(hyperparameters) self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') - self.time = dict() def _get_str_output(self, output): """Get the outputs that correspond to the str specification.""" @@ -392,18 +391,6 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] - def get_time(self): - """Get the execution time of each block. - - If called before fitting the pipeline, it will return an empty dictionary. - - Returns: - dict: - A dictionary containing the block names as keys and - the execution time in seconds as values. 
- """ - return self.time.copy() - def _extract_block_name(self, variable_name): return self._re_block_name.search(variable_name).group(1) @@ -625,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context): + def _fit_block(self, block, block_name, context, debug=False): """Get the block args from the context and fit the block.""" LOGGER.debug("Fitting block %s", block_name) try: @@ -633,14 +620,21 @@ def _fit_block(self, block, block_name, context): start = datetime.utcnow() block.fit(**fit_args) elapsed = datetime.utcnow() - start - self.time[block_name] = elapsed.total_seconds() + + if debug: + debug_info = { + "elapsed": elapsed.total_seconds(), + "input": fit_args + } + return debug_info + except Exception: if self.verbose: LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - def _produce_block(self, block, block_name, context, output_variables, outputs): + def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update @@ -649,7 +643,9 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) + start = datetime.utcnow() block_outputs = block.produce(**produce_args) + elapsed = datetime.utcnow() - start outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) context.update(outputs_dict) @@ -662,13 +658,21 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) + if debug: + debug_info = { + "elapsed": elapsed.total_seconds(), + "input": produce_args, + "output": outputs_dict + } + return debug_info + except Exception: if self.verbose: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): + def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): """Fit the blocks of this pipeline. Sequentially call the ``fit`` and the ``produce`` methods of each block, @@ -698,6 +702,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. + debug (boolean): + Debug mode, if True a dictionary containing the block names as keys and + the execution time in seconds, input, output as values is returned. + **kwargs: Any additional keyword arguments will be directly added to the context dictionary and available for the blocks. 
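In practice the flag documented above changes what ``fit`` and ``predict`` hand back. A short sketch of both call styles, in which ``pipeline`` and the data variables are illustrative assumptions:

    # Illustrative sketch of the new debug flag; the names are assumptions.
    # With no output specification, fit returns only the debug dictionary.
    debug_info = pipeline.fit(X_train, y_train, debug=True)
    for block_name, record in debug_info['fit'].items():
        print(block_name, record['elapsed'])  # seconds spent fitting the block

    # When an output is requested, the result and the debug dictionary are
    # returned together as a tuple; predict behaves the same way.
    predictions, produce_info = pipeline.predict(X_test, debug=True)

For ``fit`` the dictionary carries ``fit`` and ``produce`` sections keyed by block name, while for ``predict`` it is keyed by block name directly; each record stores ``elapsed`` and ``input``, plus ``output`` for produce steps.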
@@ -725,6 +733,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if isinstance(start_, int): start_ = self._get_block_name(start_) + debug_info = None + if debug: + debug_info = defaultdict(dict) + for block_name, block in self.blocks.items(): if start_: if block_name == start_: @@ -733,10 +745,15 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - self._fit_block(block, block_name, context) + out = self._fit_block(block, block_name, context, debug) + if debug: + debug_info["fit"][block_name] = out if (block_name != self._last_block_name) or (block_name in output_blocks): - self._produce_block(block, block_name, context, output_variables, outputs) + out = self._produce_block( + block, block_name, context, output_variables, outputs, debug) + if debug: + debug_info["produce"][block_name] = out # We already captured the output from this block if block_name in output_blocks: @@ -746,15 +763,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): # outputs we are done. if output_variables is not None and not output_blocks: if len(outputs) > 1: - return tuple(outputs) + result = tuple(outputs) else: - return outputs[0] + result = outputs[0] + + if debug: + return result, debug_info + + return result + + if debug: + return debug_info if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - def predict(self, X=None, output_='default', start_=None, **kwargs): + def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the ``produce`` method of each block, capturing the @@ -780,6 +805,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. + debug (boolean): + Debug mode, if True a dictionary containing the block names as keys and + the execution time in seconds, input, output as values is returned. + **kwargs: Any additional keyword arguments will be directly added to the context dictionary and available for the blocks. @@ -798,6 +827,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): if isinstance(start_, int): start_ = self._get_block_name(start_) + debug_info = None + if debug: + debug_info = dict() + for block_name, block in self.blocks.items(): if start_: if block_name == start_: @@ -806,7 +839,9 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): LOGGER.debug("Skipping block %s produce", block_name) continue - self._produce_block(block, block_name, context, output_variables, outputs) + out = self._produce_block(block, block_name, context, output_variables, outputs, debug) + if debug: + debug_info[block_name] = out # We already captured the output from this block if block_name in output_blocks: @@ -816,9 +851,14 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): # outputs we are done. 
if not output_blocks: if len(outputs) > 1: - return tuple(outputs) + result = tuple(outputs) else: - return outputs[0] + result = outputs[0] + + if debug: + return result, debug_info + + return result if start_: # We skipped all the blocks up to the end diff --git a/setup.py b/setup.py index 56ab70cd..a4fcc7a3 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ install_requires = [ - 'Keras>=2.1.6,<2.4' ] From f0cd86f2073e6e1c1a3efe6a0535458374bc597e Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 23:03:53 +0300 Subject: [PATCH 110/160] update mlprimitive test version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a4fcc7a3..85020231 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2,<0.3', + 'mlprimitives>=0.2.4.dev0', 'setuptools>=41.0.0', 'numpy<1.17', 'rundoc>=0.4.3', From 2909c03289df305113eae94d41f779263d25f3f6 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 00:46:12 +0300 Subject: [PATCH 111/160] cap sphinx --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 85020231..4048cbbb 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,8 @@ 'watchdog>=0.8.3', # docs - 'm2r>=0.2.0', - 'Sphinx>=1.7.1', + 'm2r>=0.2.0,<0.3', + 'Sphinx>=1.7.1,<3', 'sphinx_rtd_theme>=0.2.4', 'graphviz>=0.9', 'ipython>=6.5.0', From 22a955f47a60e778b50de752f232345e13aac64b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 01:18:21 +0300 Subject: [PATCH 112/160] cap isort --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4048cbbb..57c623f0 100644 --- a/setup.py +++ b/setup.py @@ -57,8 +57,8 @@ 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check - 'flake8>=3.5.0', - 'isort>=4.3.4', + 'flake8>=3.5.0,<3.8', + 'isort>=4.3.4<5', # fix style issues 'autoflake>=1.2', # keep this after flake8 to avoid From 444f301f641e03150490ade67604e4cc9a23703b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 01:53:42 +0300 Subject: [PATCH 113/160] cap isort (properly) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 57c623f0..c5cf4015 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ # style check 'flake8>=3.5.0,<3.8', - 'isort>=4.3.4<5', + 'isort>=4.3.4,<5', # fix style issues 'autoflake>=1.2', # keep this after flake8 to avoid From e2b6eb3e0d41717579a2949598af411ddbad1a47 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Thu, 30 Jul 2020 04:15:46 +0300 Subject: [PATCH 114/160] debug dictionary passing + added debug tests --- mlblocks/mlpipeline.py | 36 +++---- tests/test_mlpipeline.py | 198 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 18 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 8e5d0629..8367b327 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -612,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context, debug=False): + def _fit_block(self, block, block_name, context, debug=None): """Get the block args from the context and fit the block.""" LOGGER.debug("Fitting block %s", block_name) try: @@ -621,12 +621,11 @@ def _fit_block(self, block, block_name, context, debug=False): block.fit(**fit_args) 
elapsed = datetime.utcnow() - start - if debug: - debug_info = { + if debug is not None: + debug["fit"][block_name] = { "elapsed": elapsed.total_seconds(), "input": fit_args } - return debug_info except Exception: if self.verbose: @@ -634,7 +633,7 @@ def _fit_block(self, block, block_name, context, debug=False): raise - def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False): + def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update @@ -658,13 +657,17 @@ def _produce_block(self, block, block_name, context, output_variables, outputs, variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) - if debug: - debug_info = { + if debug is not None: + record = { "elapsed": elapsed.total_seconds(), "input": produce_args, "output": outputs_dict } - return debug_info + + if "fit" in debug.keys(): + debug["produce"][block_name] = record + else: + debug[block_name] = record except Exception: if self.verbose: @@ -745,15 +748,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - out = self._fit_block(block, block_name, context, debug) - if debug: - debug_info["fit"][block_name] = out + self._fit_block(block, block_name, context, debug_info) if (block_name != self._last_block_name) or (block_name in output_blocks): - out = self._produce_block( - block, block_name, context, output_variables, outputs, debug) - if debug: - debug_info["produce"][block_name] = out + self._produce_block( + block, block_name, context, output_variables, outputs, debug_info) # We already captured the output from this block if block_name in output_blocks: @@ -839,9 +838,7 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) LOGGER.debug("Skipping block %s produce", block_name) continue - out = self._produce_block(block, block_name, context, output_variables, outputs, debug) - if debug: - debug_info[block_name] = out + self._produce_block(block, block_name, context, output_variables, outputs, debug_info) # We already captured the output from this block if block_name in output_blocks: @@ -860,6 +857,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) return result + if debug: + return debug_info + if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 340a3838..25a90edb 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,204 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_no_debug(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + returned = mlpipeline.fit(debug=False) + + assert returned is None + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_debug(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + expected_return = dict() + expected_return["fit"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + } + } + } + + 
returned = mlpipeline.fit(debug=True) + + print(returned) + assert isinstance(returned, dict) + assert set(returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name + + for block_name, dictionary in expected_return["fit"].items(): + assert set(returned["fit"][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_produce_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return["fit"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + } + } + } + expected_return["produce"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + }, + "output": { + "whatever" + } + } + } + + returned, debug_returned = mlpipeline.fit(output_='default', debug=True) + + assert len([returned]) == len(outputs["default"]) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys()) + assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys()) + + for block_name, dictionary in expected_return["fit"].items(): + assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys()) + + for block_name, dictionary in expected_return["produce"].items(): + assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_predict_no_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + }, + { + 'name': 'b_name', + 'variable': 'a_primitive#1.b_variable', + 'type': 'b_type', + }, + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + }, + { + 'name': 'b_name', + 'type': 'b_type' + } + ] + + returned = mlpipeline.predict(debug=False) + assert len(returned) == len(outputs["default"]) + for returned_output, expected_output in zip(returned, outputs["default"]): + assert returned_output == expected_output["variable"] + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_predict_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + }, + "output": { + "whatever" + } + } + } + returned, 
debug_returned = mlpipeline.predict(debug=True) + assert len([returned]) == len(outputs["default"]) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) + + for block_name, dictionary in expected_return.items(): + assert set(debug_returned[block_name].keys()) == set(dictionary.keys()) + def test_fit(self): pass From 54c6698a6df91fe646071f5f971224beefa12f1a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 16:58:57 +0200 Subject: [PATCH 115/160] Update dependencies and tutorials --- .travis.yml | 6 +- Makefile | 23 +- .../tutorials/1. Using and MLPipeline.ipynb | 2 +- .../2. Finding and Loading a Pipeline.ipynb | 35 +- .... Setting MLPipeline Hyperparameters.ipynb | 2 +- ...ial execution and pipeline debugging.ipynb | 2 +- .../6. Flexible outputs specification.ipynb | 18 +- examples/tutorials/7. Tuning a Pipeline.ipynb | 46 +- ...or the best pipeline with BTBSession.ipynb | 533 +++++++----------- setup.py | 33 +- tests/test_mlpipeline.py | 3 +- tox.ini | 29 +- 12 files changed, 327 insertions(+), 405 deletions(-) diff --git a/.travis.yml b/.travis.yml index 136bd690..7c63a880 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ # Config file for automatic testing at travis-ci.org +dist: bionic language: python python: - 3.6 @@ -6,8 +7,9 @@ python: # Command to install dependencies install: - - pip install -U tox-travis codecov - - sudo apt-get install graphviz + - sudo apt-get update + - sudo apt-get install graphviz pandoc + - pip install -U tox-travis codecov # Command to run tests script: tox diff --git a/Makefile b/Makefile index eb422682..6cc80705 100644 --- a/Makefile +++ b/Makefile @@ -110,13 +110,30 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle # TEST TARGETS -.PHONY: test -test: ## run tests quickly with the default Python +.PHONY: test-unit +test-unit: ## run tests quickly with the default Python python -m pytest --cov=mlblocks .PHONY: test-readme test-readme: ## run the readme snippets - rundoc run --single-session python3 -t python3 README.md + rm -rf tests/readme_test && mkdir tests/readme_test + cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md + rm -rf tests/readme_test + +.PHONY: test-tutorials +test-tutorials: ## run the tutorial notebooks + find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \; + +.PHONY: test +test: test-unit test-readme ## test everything that needs test dependencies + +.PHONY: check-dependencies +check-dependencies: ## test if there are any broken dependencies + pip check + +.PHONY: test-devel +test-devel: check-dependencies lint docs ## test everything that needs development dependencies .PHONY: test-all test-all: ## run tests on every Python version with tox diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb index 733fb42d..dab130ea 100644 --- a/examples/tutorials/1. Using and MLPipeline.ipynb +++ b/examples/tutorials/1. Using and MLPipeline.ipynb @@ -625,7 +625,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb index 8df76259..7f14662a 100644 --- a/examples/tutorials/2. 
Finding and Loading a Pipeline.ipynb +++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb @@ -52,18 +52,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['image.classification.hog.random_forest',\n", - " 'image.classification.hog.xgboost',\n", - " 'image.classification.resnet50.xgboost']" + "['image.classification.hog.rf',\n", + " 'image.classification.hog.xgb',\n", + " 'image.classification.resnet50.xgb',\n", + " 'keras.Sequential.SingleLayerCNNImageClassifier',\n", + " 'keras.Sequential.VGGCNNClassifier']" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -72,7 +74,7 @@ "from mlblocks.discovery import find_pipelines\n", "\n", "filters = {\n", - " 'metadata.data_modality': 'image',\n", + " 'metadata.data_type': 'image',\n", " 'metadata.task_type': 'classification',\n", "}\n", "\n", @@ -89,13 +91,26 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n", + "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", + "\n" + ] + } + ], "source": [ "from mlblocks import MLPipeline\n", "\n", - "pipeline = MLPipeline('image.classification.resnet50.xgboost')" + "pipeline = MLPipeline('image.classification.resnet50.xgb')" ] } ], @@ -115,7 +130,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 725226f7..5b7944b5 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -204,7 +204,7 @@ " }\n", "}\n", "pipeline = MLPipeline(\n", - " 'single_table.classification.categorical_encoder.xgboost',\n", + " primitives,\n", " init_params=init_params\n", ")" ] diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb index 2e21c85b..57b2b43c 100644 --- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb +++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb @@ -704,7 +704,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb index 3dc3686f..ca1048dd 100644 --- a/examples/tutorials/6. Flexible outputs specification.ipynb +++ b/examples/tutorials/6. 
Flexible outputs specification.ipynb @@ -380,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -400,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -418,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -454,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -463,7 +463,7 @@ "(24420, 108)" ] }, - "execution_count": 28, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -483,7 +483,7 @@ "(24420, 108)" ] }, - "execution_count": 29, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +509,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index 8dbc4366..4b6eae24 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. Tuning a Pipeline.ipynb @@ -58,7 +58,7 @@ "source": [ "from mlblocks import MLPipeline\n", "\n", - "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + "template = MLPipeline('single_table.classification.xgb')" ] }, { @@ -204,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -213,7 +213,7 @@ "0.8686773872402614" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -238,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -261,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -346,13 +346,15 @@ "text": [ "scoring pipeline 1\n", "scoring pipeline 2\n", - "New best found: 0.8722706212975673\n", "scoring pipeline 3\n", "scoring pipeline 4\n", + "New best found: 0.8642241881762839\n", "scoring pipeline 5\n", "scoring pipeline 6\n", "scoring pipeline 7\n", + "New best found: 0.8644390957265209\n", "scoring pipeline 8\n", + "New best found: 0.8679095503945804\n", "scoring pipeline 9\n", "scoring pipeline 10\n" ] @@ -386,23 +388,23 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 
'max_labels'): 40,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}" + " 'max_labels'): 39,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'most_frequent',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 70,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 6,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.07406443671152008,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9244108160038952,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -422,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -431,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -455,7 +457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index a1f0c0f4..1fb4d7ca 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb @@ -103,11 +103,7 @@ "source": [ "from mlblocks.discovery import find_pipelines\n", "\n", - "filters = {\n", - " 'metadata.data_modality': 'single_table',\n", - " 'metadata.task_type': 'classification'\n", - "}\n", - "templates = find_pipelines(filters=filters)" + "templates = find_pipelines('single_table.classification')" ] }, { @@ -118,13 +114,9 @@ { "data": { "text/plain": [ - "['single_table.classification.categorical_encoder.logit',\n", - " 'single_table.classification.categorical_encoder.random_forest',\n", - " 'single_table.classification.categorical_encoder.xgboost',\n", - " 'single_table.classification.mlprimitives.logit',\n", - " 'single_table.classification.mlprimitives.random_forest',\n", - " 'single_table.classification.mlprimitives.xgboost',\n", - " 'single_table.classification.mlprimitives_text.xgboost']" + "['single_table.classification',\n", + " 'single_table.classification.text',\n", + " 'single_table.classification.xgb']" ] }, "execution_count": 4, @@ -165,7 +157,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -174,7 +166,7 @@ } ], "source": [ - "templates_dict['single_table.classification.mlprimitives.xgboost']" + "templates_dict['single_table.classification.xgb']" ] }, { @@ -250,12 +242,6 @@ "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): {'type': 'bool', 'default': True},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): {'type': 'bool', 'default': True},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n", " 
('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", " 'default': 'mean',\n", " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", @@ -282,7 +268,7 @@ } ], "source": [ - "tunables['single_table.classification.mlprimitives.xgboost']" + "tunables['single_table.classification.xgb']" ] }, { @@ -296,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -322,13 +308,15 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 11, + "metadata": { + "scrolled": false + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1", + "model_id": "342fe40f08024adcb5b60eea25f49d37", "version_major": 2, "version_minor": 0 }, @@ -343,18 +331,98 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n", - "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n", - "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n" + "2020-09-16 16:32:40,826 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:40,827 - INFO - btb.session - Obtaining default configuration for single_table.classification\n", + "2020-09-16 16:32:46,432 - INFO - btb.session - New optimal found: single_table.classification - 0.8639171383183359\n", + "2020-09-16 16:32:46,435 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:46,436 - INFO - btb.session - Obtaining default configuration for single_table.classification.text\n", + "2020-09-16 16:32:46,583 - ERROR - mlblocks.mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File 
\"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-09-16 16:32:46,586 - ERROR - btb.session - Proposal 2 - single_table.classification.text crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " 
File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/btb/session.py\", line 336, in run\n", + " score = self._scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 754, in fit\n", + " block, block_name, context, output_variables, outputs, debug_info)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-09-16 16:32:46,587 - WARNING - btb.session - Too many errors: 1. 
Removing tunable single_table.classification.text\n", + "2020-09-16 16:32:46,589 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:46,589 - INFO - btb.session - Obtaining default configuration for single_table.classification.xgb\n", + "2020-09-16 16:32:52,100 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:33:28,900 - INFO - btb.session - New optimal found: single_table.classification - 0.8728234138413778\n", + "2020-09-16 16:33:28,904 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n" ] }, { @@ -367,20 +435,26 @@ { "data": { "text/plain": [ - "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", - " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " 'max_labels'): 63,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 7315,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.8639171383183359}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", + " 'score': 0.8728234138413778}" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -411,26 +485,32 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", - " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " 'max_labels'): 63,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 7315,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.8639171383183359}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 
0.23231879890615814,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", + " 'score': 0.8728234138413778}" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -455,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": { "scrolled": false }, @@ -463,7 +543,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a76ce44e1173496e99baaf7ee39a3df7", + "model_id": "8dd5d4626f304c279b2b368a671b6cb7", "version_major": 2, "version_minor": 0 }, @@ -478,219 +558,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n", - "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", - " return self._engine.get_loc(key)\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", - " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", - " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", - " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", - " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: 
('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", - " return self._engine.get_loc(key)\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n", - " self._produce_block(block, block_name, context, output_variables, outputs)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", - " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", - " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", - " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", - " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File 
\"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n", - "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n", - "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n", - "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n", - "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", - "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", - "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n", - "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", - "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n", - "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n", - "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", - " self._fit_block(block, block_name, context)\n", - " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n", - "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n", - "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", - 
"('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", - "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n", - "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", - "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n", - "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n", - "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", - " self._fit_block(block, block_name, context)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. 
\"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n", - "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n" + "2020-09-16 16:34:46,679 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:35:39,310 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:36:53,519 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:37:31,639 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:37:34,254 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:38:33,930 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:38:46,228 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:39:09,193 - INFO - btb.session - New optimal found: single_table.classification - 0.8730998313333643\n", + "2020-09-16 16:39:09,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:40:06,793 - INFO - btb.session - 
Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:40:44,917 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:41:19,357 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:41:29,076 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:41:46,742 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:42:24,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:42:37,998 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:43:03,272 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:44:01,301 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:44:12,500 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:44:32,221 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:45:20,148 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n" ] }, { @@ -703,26 +591,26 @@ { "data": { "text/plain": [ - "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", - " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 22,\n", + " 'max_labels'): 97,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'lowercase'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'binary'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 3863,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " 'max_features'): 270,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", - " 'score': 0.8739290413691612}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", + " 'score': 0.8730998313333643}" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -757,32 +645,32 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", - " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 22,\n", + " 'max_labels'): 
97,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'lowercase'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'binary'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 3863,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " 'max_features'): 270,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", - " 'score': 0.8739290413691612}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", + " 'score': 0.8730998313333643}" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -818,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": { "scrolled": false }, @@ -826,25 +714,36 @@ { "data": { "text/plain": [ - "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n", - " 'name': 'single_table.classification.categorical_encoder.logit',\n", + "[{'id': 'c2cd14c7e9470448a0eeb58a3cce327f',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", " 'max_labels'): 0,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 1000,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n", - " 'score': 0.7975185708718643},\n", - " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n", - " 'name': 'single_table.classification.categorical_encoder.random_forest',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359},\n", + " {'id': 'adbd189a819483ddc869ceb94513b369',\n", + " 'name': 
'single_table.classification.text',\n", + " 'config': {('mlprimitives.custom.text.TextCleaner#1', 'lower'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'accents'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 1000,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n", @@ -858,10 +757,10 @@ " ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n", - " 'score': 0.7591904454179904}]" + " 'score': None}]" ] }, - "execution_count": 20, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +786,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 945385da..b1aafccb 100644 --- a/setup.py +++ b/setup.py @@ -20,15 +20,20 @@ examples_require = [ - 'mlprimitives>=0.2.4.dev0', - 'jupyter==1.0.0' + 'matplotlib>=2.2.2,<3.2.2', + 'mlprimitives>=0.2.5,<0.3', + 'boto3>=1.14,<1.14.45', + 'botocore<1.17.45,>=1.17.44', + 'jupyter==1.0.0', + 'docutils<0.16,>=0.10', + 'baytune>=0.3.0,<0.4', ] tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2.4.dev0', + 'mlprimitives>=0.2,<0.3', 'setuptools>=41.0.0', 'numpy<1.17', 'rundoc>=0.4.3', @@ -43,34 +48,32 @@ development_requires = [ # general - 'bumpversion>=0.5.3', + 'bumpversion>=0.5.3,<0.6', 'pip>=9.0.1', - 'watchdog>=0.8.3', + 'watchdog>=0.8.3,<0.11', # docs 'm2r>=0.2.0,<0.3', 'Sphinx>=1.7.1,<3', - 'sphinx_rtd_theme>=0.2.4', + 'sphinx_rtd_theme>=0.2.4,<0.5', 'ipython>=6.5.0', - 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', - 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check - 'flake8>=3.5.0,<3.8', + 'flake8>=3.7.7,<4', 'isort>=4.3.4,<5', # fix style issues - 'autoflake>=1.2', # keep this after flake8 to avoid - 'autopep8>=1.3.5', # version incompatibilities with flake8 + 'autoflake>=1.1,<2', + 'autopep8>=1.4.3,<2', # distribute on PyPI - 'twine>=1.10.0', + 'twine>=1.10.0,<4', 'wheel>=0.30.0', # Advanced testing - 'tox>=2.9.1', - 'coverage>=4.5.1', + 'coverage>=4.5.1,<6', + 'tox>=2.9.1,<4', # Documentation style 'doc8>=0.8.0', @@ -93,7 +96,7 @@ description="Pipelines and primitives for machine learning and data science.", extras_require={ 'dev': development_requires + tests_require + examples_require, - 'test': tests_require, + 'test': tests_require + examples_require, 'examples': examples_require, }, include_package_data=True, diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index ffdd8deb..59e11633 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -879,6 +879,7 @@ def test_predict_debug(self): for block_name, dictionary in expected_return.items(): assert set(debug_returned[block_name].keys()) == set(dictionary.keys()) + 
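# Mock MLBlock here as well, so that building the pipeline for the
+    # diagram test does not attempt to load real primitive annotations.
+    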
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_simple(self): f = open('tests/data/diagrams/diagram_simple.txt', 'r') expected = f.read()[:-1] @@ -984,4 +985,4 @@ def test_from_dict(self): pass def test_load(self): - pass \ No newline at end of file + pass diff --git a/tox.ini b/tox.ini index 1b8a777e..96d29dbe 100644 --- a/tox.ini +++ b/tox.ini @@ -1,37 +1,20 @@ [tox] -envlist = py35, py36, lint, docs, readme - +envlist = py3{5,6}, test-devel [travis] python = - 3.6: py36, lint, docs + 3.6: py36, test-devel 3.5: py35 - [testenv] passenv = CI TRAVIS TRAVIS_* -setenv = - PYTHONPATH = {toxinidir} +skipsdist = false +skip_install = false extras = test commands = /usr/bin/env make test - -[testenv:lint] -skipsdist = true -extras = dev -commands = - /usr/bin/env make lint - - -[testenv:docs] -skipsdist = true +[testenv:test-devel] extras = dev commands = - /usr/bin/env make docs - - -[testenv:readme] -skipsdist = true -commands = - /usr/bin/env make test-readme + /usr/bin/env make test-devel From 6ac5731d69c71533499fbac8ac90932289ebc1c7 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 18:10:55 +0200 Subject: [PATCH 116/160] Add travis wait --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7c63a880..97f4bcf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: - pip install -U tox-travis codecov # Command to run tests -script: tox +script: travis_wait tox after_success: codecov From 0fac3ce2cc2f4d4982982c52e63d1e9198a91896 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 20:07:57 +0200 Subject: [PATCH 117/160] travis wait 60 min --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97f4bcf8..51ac1dd8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: - pip install -U tox-travis codecov # Command to run tests -script: travis_wait tox +script: travis_wait 60 tox after_success: codecov From f6bff86bb061a85789981bfbf0a0366c6cab7f95 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Nov 2020 12:47:35 +0100 Subject: [PATCH 118/160] Remove unused datasets module --- mlblocks/datasets.py | 447 ----------------------------------------- tests/test_datasets.py | 58 ------ 2 files changed, 505 deletions(-) delete mode 100644 mlblocks/datasets.py delete mode 100644 tests/test_datasets.py diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py deleted file mode 100644 index 0c69afda..00000000 --- a/mlblocks/datasets.py +++ /dev/null @@ -1,447 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Datasets module. - -This module contains functions that allow loading datasets for easy -testing of pipelines and primitives over multiple data modalities -and task types. 
-
-The available datasets by data modality and task type are:
-
-+---------------+---------------+-------------------------+
-| Dataset       | Data Modality | Task Type               |
-+===============+===============+=========================+
-| Amazon        | Graph         | Community Detection     |
-+---------------+---------------+-------------------------+
-| DIC28         | Graph         | Graph Matching          |
-+---------------+---------------+-------------------------+
-| UMLs          | Graph         | Link Prediction         |
-+---------------+---------------+-------------------------+
-| Nomination    | Graph         | Vertex Nomination       |
-+---------------+---------------+-------------------------+
-| USPS          | Image         | Classification          |
-+---------------+---------------+-------------------------+
-| Hand Geometry | Image         | Regression              |
-+---------------+---------------+-------------------------+
-| Iris          | Single Table  | Classification          |
-+---------------+---------------+-------------------------+
-| Jester        | Single Table  | Collaborative Filtering |
-+---------------+---------------+-------------------------+
-| Boston        | Single Table  | Regression              |
-+---------------+---------------+-------------------------+
-| Wiki QA       | Multi Table   | Classification          |
-+---------------+---------------+-------------------------+
-| Personae      | Text          | Classification          |
-+---------------+---------------+-------------------------+
-| News Groups   | Text          | Classification          |
-+---------------+---------------+-------------------------+
-
-"""
-
-import io
-import logging
-import os
-import tarfile
-import urllib
-
-import networkx as nx
-import numpy as np
-import pandas as pd
-from keras.preprocessing.image import img_to_array, load_img
-from sklearn import datasets
-from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-
-LOGGER = logging.getLogger(__name__)
-
-INPUT_SHAPE = [224, 224, 3]
-
-DATA_PATH = os.path.join(
-    os.path.dirname(__file__),
-    'data'
-)
-DATA_URL = 'http://dai-mlblocks.s3.amazonaws.com/%7B%7D.tar.gz'
-
-
-class Dataset():
-    """Dataset class.
-
-    This class represents the abstraction of a dataset and works as
-    a container of all the things needed in order to use a dataset
-    for testing.
-
-    Among other things, it includes the actual dataset data, information
-    about its origin, a score function that works for this dataset,
-    and a method to split the data in multiple ways for goodness-of-fit
-    evaluation.
-
-    Attributes:
-        name (str): Name of this dataset.
-        description (str): Short description about the data that composes this dataset.
-        data (array-like): Numpy array or pandas DataFrame containing all the data of
-            this dataset, excluding the labels or target values.
-        target (array-like): Numpy array or pandas Series containing the expected labels
-            or values.
-        **kwargs: Any additional keyword argument passed on initialization is also
-            available as an instance attribute.
-
-    Args:
-        description (str): Short description about the data that composes this dataset.
-            The first line of the description is expected to be a human friendly
-            name for the dataset, and will be set as the `name` attribute.
-        data (array-like): Numpy array or pandas DataFrame containing all the data of
-            this dataset, excluding the labels or target values.
-        target (array-like): Numpy array or pandas Series containing the expected labels
-            or values.
-        score (callable): Function that will be used to compute the score of this dataset.
-        shuffle (bool): Whether or not to shuffle the data before splitting.
-        stratify (bool): Whether to use a stratified or regular KFold for splitting.
-        **kwargs: Any additional keyword argument passed on initialization will be made
-            available as an instance attribute.
-    """
-
-    def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
-
-        self.name = description.splitlines()[0]
-        self.description = description
-
-        self.data = data
-        self.target = target
-
-        self._stratify = stratify
-        self._shuffle = shuffle
-        self._score = score
-
-        self.__dict__.update(kwargs)
-
-    def score(self, *args, **kwargs):
-        r"""Scoring function for this dataset.
-
-        Args:
-            \*args, \*\*kwargs: Any given arguments and keyword arguments will be
-                directly passed to the given scoring function.
-
-        Returns:
-            float:
-                The computed score.
-        """
-        return self._score(*args, **kwargs)
-
-    def __repr__(self):
-        return self.name
-
-    def describe(self):
-        """Print the description of this Dataset on stdout."""
-        print(self.description)
-
-    @staticmethod
-    def _get_split(data, index):
-        if hasattr(data, 'iloc'):
-            return data.iloc[index]
-        else:
-            return data[index]
-
-    def get_splits(self, n_splits=1, random_state=0):
-        """Return splits of this dataset ready for Cross Validation.
-
-        If n_splits is 1, a tuple containing the X for train and test
-        and the y for train and test is returned.
-        Otherwise, if n_splits is bigger than 1, a list of such tuples
-        is returned, one for each split.
-
-        Args:
-            n_splits (int): Number of times that the data needs to be split.
-
-        Returns:
-            tuple or list:
-                If n_splits is 1, a tuple containing the X for train and test
-                and the y for train and test is returned.
-                Otherwise, if n_splits is bigger than 1, a list of such tuples
-                is returned, one for each split.
-        """
-        if n_splits == 1:
-            stratify = self.target if self._stratify else None
-
-            return train_test_split(
-                self.data,
-                self.target,
-                shuffle=self._shuffle,
-                stratify=stratify,
-                random_state=random_state
-            )
-
-        else:
-            cv_class = StratifiedKFold if self._stratify else KFold
-            cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state)
-
-            splits = list()
-            for train, test in cv.split(self.data, self.target):
-                X_train = self._get_split(self.data, train)
-                y_train = self._get_split(self.target, train)
-                X_test = self._get_split(self.data, test)
-                y_test = self._get_split(self.target, test)
-                splits.append((X_train, X_test, y_train, y_test))
-
-            return splits
-
-
-def _download(dataset_name, dataset_path):
-    url = DATA_URL.format(dataset_name)
-
-    LOGGER.debug('Downloading dataset %s from %s', dataset_name, url)
-    response = urllib.request.urlopen(url)
-    bytes_io = io.BytesIO(response.read())
-
-    LOGGER.debug('Extracting dataset into %s', DATA_PATH)
-    with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf:
-        tf.extractall(DATA_PATH)
-
-
-def _load(dataset_name):
-    if not os.path.exists(DATA_PATH):
-        os.makedirs(DATA_PATH)
-
-    dataset_path = os.path.join(DATA_PATH, dataset_name)
-    if not os.path.exists(dataset_path):
-        _download(dataset_name, dataset_path)
-
-    return dataset_path
-
-
-def _load_images(image_dir, filenames):
-    LOGGER.debug('Loading %s images from %s', len(filenames), image_dir)
-    images = []
-    for filename in filenames:
-        filename = os.path.join(image_dir, filename)
-
-        image = load_img(filename)
-        image = image.resize(tuple(INPUT_SHAPE[0:2]))
-        image = img_to_array(image)
-        image = image / 255.0  # Normalize pixel values to the [0, 1] range.
-        images.append(image)
-
-    return np.array(images)
-
-
-def _load_csv(dataset_path, name, set_index=False):
-    csv_path = os.path.join(dataset_path, name + '.csv')
-
-    LOGGER.debug('Loading csv %s', csv_path)
-    df = pd.read_csv(csv_path)
-
-    if set_index:
-        df = df.set_index(df.columns[0], drop=False)
-
-    return df
-
-
-def load_usps():
-    """USPS Digits Dataset.
-
-    The data of this dataset is a 4d numpy array with shape (9298, 224, 224, 3)
-    containing 9298 224x224 RGB photos of handwritten digits, and the target is
-    a 1d numpy integer array containing the label of the digit represented in
-    the image.
-    """
-    dataset_path = _load('usps')
-
-    df = _load_csv(dataset_path, 'data')
-    X = _load_images(os.path.join(dataset_path, 'images'), df.image)
-    y = df.label.values
-
-    return Dataset(load_usps.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_handgeometry():
-    """Hand Geometry Dataset.
-
-    The data of this dataset is a 4d numpy array with shape (112, 224, 224, 3)
-    containing 112 224x224 RGB photos of hands, and the target is a 1d numpy
-    float array containing the width of the wrist in centimeters.
-    """
-    dataset_path = _load('handgeometry')
-
-    df = _load_csv(dataset_path, 'data')
-    X = _load_images(os.path.join(dataset_path, 'images'), df.image)
-    y = df.target.values
-
-    return Dataset(load_handgeometry.__doc__, X, y, r2_score)
-
-
-def load_personae():
-    """Personae Dataset.
-
-    The data of this dataset is a 2d numpy array containing 145 entries
-    that include texts written by Dutch users in Twitter, with some additional
-    information about the author, and the target is a 1d numpy binary integer
-    array indicating whether the author was extrovert or not.
-    """
-    dataset_path = _load('personae')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    return Dataset(load_personae.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_umls():
-    """UMLs Dataset.
-
-    The data consists of information about a graph of 135 nodes and the relations
-    between them, given as a DataFrame with three columns, source, target and type,
-    indicating which nodes are related and with which type of link. The target is
-    a 1d numpy binary integer array indicating whether the indicated link exists
-    or not.
-    """
-    dataset_path = _load('umls')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_umls.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_dic28():
-    """DIC28 Dataset from Pajek.
-
-    This network represents connections among English words in a dictionary.
-    It was generated from Knuth's dictionary. Two words are connected by an
-    edge if we can reach one from the other by
-
-    - changing a single character (e.g., work - word)
-    - adding / removing a single character (e.g., ever - fever).
-
-    There exist 52,652 words (vertices in a network) having 2 up to 8 characters
-    in the dictionary. The obtained network has 89038 edges.
-    """
-    dataset_path = _load('dic28')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph1 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph1.gml')))
-    graph2 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph2.gml')))
-
-    graph = graph1.copy()
-    graph.add_nodes_from(graph2.nodes(data=True))
-    graph.add_edges_from(graph2.edges)
-    graph.add_edges_from(X[['graph1', 'graph2']].values)
-
-    graphs = {
-        'graph1': graph1,
-        'graph2': graph2,
-    }
-
-    return Dataset(load_dic28.__doc__, X, y, accuracy_score,
-                   stratify=True, graph=graph, graphs=graphs)
-
-
-def load_nomination():
-    """Sample 1 of graph vertex nomination data from MII Lincoln Lab.
-
-    Data consists of one graph whose nodes contain two attributes, attr1 and attr2.
-    Associated with each node is a label that has to be learned and predicted.
-    """
-    dataset_path = _load('nomination')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_nomination.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_amazon():
-    """Amazon product co-purchasing network and ground-truth communities.
-
-    Network was collected by crawling Amazon website. It is based on Customers Who Bought
-    This Item Also Bought feature of the Amazon website. If a product i is frequently
-    co-purchased with product j, the graph contains an undirected edge from i to j.
-    Each product category provided by Amazon defines each ground-truth community.
-    """
-    dataset_path = _load('amazon')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score, graph=graph)
-
-
-def load_jester():
-    """Ratings from the Jester Online Joke Recommender System.
-
-    This dataset consists of over 1.7 million instances of (user_id, item_id, rating)
-    triples, which is split 50-50 into train and test data.
-
-    source: "University of California Berkeley, CA"
-    sourceURI: "http://eigentaste.berkeley.edu/dataset/"
-    """
-    dataset_path = _load('jester')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('rating').values
-
-    return Dataset(load_jester.__doc__, X, y, r2_score)
-
-
-def load_wikiqa():
-    """Challenge Dataset for Open-Domain Question Answering.
-
-    WikiQA dataset is a publicly available set of question and sentence (QS) pairs,
-    collected and annotated for research on open-domain question answering.
-
-    source: "Microsoft"
-    sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#"
-    """  # noqa
-    dataset_path = _load('wikiqa')
-
-    data = _load_csv(dataset_path, 'data', set_index=True)
-    questions = _load_csv(dataset_path, 'questions', set_index=True)
-    sentences = _load_csv(dataset_path, 'sentences', set_index=True)
-    vocabulary = _load_csv(dataset_path, 'vocabulary', set_index=True)
-
-    entities = {
-        'data': (data, 'd3mIndex', None),
-        'questions': (questions, 'qIndex', None),
-        'sentences': (sentences, 'sIndex', None),
-        'vocabulary': (vocabulary, 'index', None)
-    }
-    relationships = [
-        ('questions', 'qIndex', 'data', 'qIndex'),
-        ('sentences', 'sIndex', 'data', 'sIndex')
-    ]
-
-    target = data.pop('isAnswer').values
-
-    return Dataset(load_wikiqa.__doc__, data, target, accuracy_score, stratify=True,
-                   entities=entities, relationships=relationships)
-
-
-def load_newsgroups():
-    """20 News Groups Dataset.
-
-    The data of this dataset is a 1d numpy array containing the texts
-    from 11314 newsgroups posts, and the target is a 1d numpy integer array
-    containing the label of one of the 20 topics that they are about.
-    """
-    dataset = datasets.fetch_20newsgroups()
-    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
-                   accuracy_score, stratify=True)
-
-
-def load_iris():
-    """Iris Dataset."""
-    dataset = datasets.load_iris()
-    return Dataset(load_iris.__doc__, dataset.data, dataset.target,
-                   accuracy_score, stratify=True)
-
-
-def load_boston():
-    """Boston House Prices Dataset."""
-    dataset = datasets.load_boston()
-    return Dataset(load_boston.__doc__, dataset.data, dataset.target, r2_score)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
deleted file mode 100644
index 174a85d6..00000000
--- a/tests/test_datasets.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from unittest import TestCase
-from unittest.mock import Mock
-
-from mlblocks import datasets
-
-
-class TestDataset(TestCase):
-
-    def setUp(self):
-        self.description = """Dataset Name.
-
-        Some extended description.
- """ - - dataset = datasets.Dataset(description, 'data', 'target', 'score') - dataset.describe() - - captured = capsys.readouterr() - assert captured.out == description + '\n' From 7afbe40e6006ab5c228df7e8f9ae3e3cc3ab1ce5 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Nov 2020 12:48:42 +0100 Subject: [PATCH 119/160] Update python support and dependency ranges --- .travis.yml | 5 +++-- README.md | 2 +- setup.py | 10 ++++++---- tox.ini | 7 ++++--- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 51ac1dd8..d2a982f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,8 +2,9 @@ dist: bionic language: python python: + - 3.8 + - 3.7 - 3.6 - - 3.5 # Command to install dependencies install: @@ -26,4 +27,4 @@ deploy: target-branch: gh-pages on: branch: master - python: 3.6 + python: 3.8 diff --git a/README.md b/README.md index fa4260d5..127089ac 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid diff --git a/setup.py b/setup.py index b1aafccb..b07eccb6 100644 --- a/setup.py +++ b/setup.py @@ -16,12 +16,13 @@ install_requires = [ 'graphviz>=0.9,<1', + 'numpy>=1.17.1,<1.19', ] examples_require = [ 'matplotlib>=2.2.2,<3.2.2', - 'mlprimitives>=0.2.5,<0.3', + 'mlprimitives>=0.2.6.dev0,<0.3', 'boto3>=1.14,<1.14.45', 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', @@ -33,9 +34,8 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2,<0.3', + 'mlprimitives>=0.2.6.dev0,<0.3', 'setuptools>=41.0.0', - 'numpy<1.17', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', ] @@ -90,8 +90,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ], description="Pipelines and primitives for machine learning and data science.", extras_require={ @@ -107,6 +108,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 96d29dbe..1bc3f81a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,11 @@ [tox] -envlist = py3{5,6}, test-devel +envlist = py3{6,7,8}, test-devel [travis] python = - 3.6: py36, test-devel - 3.5: py35 + 3.8: py38, test-devel + 3.7: py37 + 3.6: py36 [testenv] passenv = CI TRAVIS TRAVIS_* From cf419bd64b2f90aafd0e56df25325103c646e45e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 13 Dec 2020 18:16:46 +0100 Subject: [PATCH 120/160] Update baytune dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b07eccb6..6ae0c75e 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', 'docutils<0.16,>=0.10', - 'baytune>=0.3.0,<0.4', + 'baytune>=0.3.13.dev0,<0.4', ] From 68774a040ee489ee4abbb1e73acfe30ab556bfb2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 13 Dec 2020 18:31:08 +0100 Subject: [PATCH 
121/160] Change links to mlbazaar

---
 CONTRIBUTING.rst                              |  8 +++---
 HISTORY.md                                    | 26 +++++++++----------
 README.md                                     | 22 ++++++++--------
 docs/advanced_usage/adding_primitives.rst     |  6 ++---
 docs/advanced_usage/hyperparameters.rst       |  4 +--
 docs/advanced_usage/primitives.rst            |  6 ++---
 docs/conf.py                                  |  2 +-
 docs/getting_started/install.rst              |  2 +-
 docs/getting_started/quickstart.rst           |  2 +-
 docs/index.rst                                |  6 ++---
 docs/pipeline_examples/graph.rst              |  2 +-
 docs/pipeline_examples/image.rst              |  2 +-
 docs/pipeline_examples/multi_table.rst        |  2 +-
 docs/pipeline_examples/text.rst               |  6 ++---
 examples/README.md                            |  6 ++---
 .... Setting MLPipeline Hyperparameters.ipynb |  2 +-
 examples/tutorials/7. Tuning a Pipeline.ipynb |  2 +-
 ...or the best pipeline with BTBSession.ipynb |  2 +-
 mlblocks/__init__.py                          |  2 +-
 setup.py                                      |  2 +-
 20 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4c01093e..43acf3a0 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -15,7 +15,7 @@ Types of Contributions
 Report Bugs
 ~~~~~~~~~~~
 
-Report bugs at https://github.com/HDI-Project/MLBlocks/issues.
+Report bugs at https://github.com/MLBazaar/MLBlocks/issues.
 
 If you are reporting a bug, please include:
 
@@ -45,7 +45,7 @@ articles, and such.
 Submit Feedback
 ~~~~~~~~~~~~~~~
 
-The best way to send feedback is to file an issue at https://github.com/HDI-Project/MLBlocks/issues.
+The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues.
 
 If you are proposing a feature:
 
@@ -120,8 +120,8 @@ Before you submit a pull request, check that it meets these guidelines:
 4. If the pull request adds functionality, the docs should be updated. Put
    your new functionality into a function with a docstring, and add the
    feature to the list in README.rst.
-5. The pull request should work for Python2.7, 3.4, 3.5 and 3.6. Check
-   https://travis-ci.org/HDI-Project/MLBlocks/pull_requests
+5. The pull request should work for all the supported python versions. Check
+   https://travis-ci.org/MLBazaar/MLBlocks/pull_requests
    and make sure that all the checks pass.
Unit Testing Guidelines diff --git a/HISTORY.md b/HISTORY.md index 5b5d4f0b..17bbda92 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,31 +4,31 @@ Changelog 0.3.4 - 2019-11-01 ------------------ -* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala -* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala +* Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala +* Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala 0.3.3 - 2019-09-09 ------------------ -* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala +* Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala 0.3.2 - 2019-08-12 ------------------ -* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala -* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala -* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala -* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala -* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala +* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala +* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala +* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala +* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala +* Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala 0.3.1 - Pipelines Discovery --------------------------- -* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala -* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala -* Implement partial re-fit -[Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala -* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala -* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala +* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala +* Implement partial re-fit -[Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock - [Issue #86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala 0.3.0 - New Primitives Discovery -------------------------------- diff --git a/README.md b/README.md 
index 127089ac..770f34ef 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,18 @@ Pipelines and Primitives for Machine Learning and Data Science. [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) -[![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks) -[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) +[![Travis](https://travis-ci.org/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.org/MLBazaar/MLBlocks) +[![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks)
# MLBlocks -* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE) +* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) * Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) -* Documentation: https://HDI-Project.github.io/MLBlocks -* Homepage: https://github.com/HDI-Project/MLBlocks +* Documentation: https://mlbazaar.github.io/MLBlocks +* Homepage: https://github.com/MLBazaar/MLBlocks ## Overview @@ -38,7 +38,7 @@ Features include: no python code to write, carefully curated by Machine Learning and Domain experts. * Extract machine-readable information about which hyperparameters can be tuned and within which ranges, allowing automated integration with Hyperparameter Optimization tools like - [BTB](https://github.com/HDI-Project/BTB). + [BTB](https://github.com/MLBazaar/BTB). * Complex multi-branch pipelines and DAG configurations, with unlimited number of inputs and outputs per primitive. * Easy save and load Pipelines using JSON Annotations. @@ -65,14 +65,14 @@ pip install mlblocks This will pull and install the latest stable release from [PyPi](https://pypi.org/). If you want to install from source or contribute to the project please read the -[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started). +[Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started). ## MLPrimitives In order to be usable, MLBlocks requires a compatible primitives library. The official library, required in order to follow the following MLBlocks tutorial, -is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), which you can install +is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install with this command: ```bash @@ -83,7 +83,7 @@ pip install mlprimitives Below there is a short example about how to use **MLBlocks** to solve the [Adult Census Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a -pipeline which combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), +pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). ```python3 @@ -112,10 +112,10 @@ dataset.score(y_test, predictions) If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please -check our [documentation site](https://HDI-Project.github.io/MLBlocks). +check our [documentation site](https://mlbazaar.github.io/MLBlocks). Also do not forget to have a look at the [notebook tutorials]( -https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)! +https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)! # Citing MLBlocks diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index 9d358629..5ad0b60b 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -17,8 +17,8 @@ This can be achieved by running the commands:: For further details, please refer to the `MLPrimitives Documentation`_. -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _MLPrimitives Documentation: https://hdi-project.github.io/MLPrimitives/ +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. 
_MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/ Writing Primitives ------------------ @@ -27,7 +27,7 @@ Sometimes you will find that you want to use a primitive that is not in the list `MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself by writing the corresponding `JSON annotation `_. -.. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives +.. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives .. note:: If you create new primitives for MLBlocks, please consider contributing them to the **MLPrimitives** project! diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst index 71686ac5..488be9a9 100644 --- a/docs/advanced_usage/hyperparameters.rst +++ b/docs/advanced_usage/hyperparameters.rst @@ -221,8 +221,8 @@ In this case, the hyperparameters would be annotated like this:: of type, range and default value as a nested dictionary to be used by default. .. _JSON Annotations: primitives.html#json-annotations -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _BTB: https://github.com/HDI-Project/BTB +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _BTB: https://github.com/MLBazaar/BTB .. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline .. _multitype: #multitype-hyperparameters .. _conditional: #conditional-hyperparameters diff --git a/docs/advanced_usage/primitives.rst b/docs/advanced_usage/primitives.rst index 58847bbe..37df9031 100644 --- a/docs/advanced_usage/primitives.rst +++ b/docs/advanced_usage/primitives.rst @@ -311,11 +311,11 @@ For a more detailed description of this class, please check the corresponding section in the `API Reference`_ documentation. .. _API Reference: ../api_reference.html -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _keras.preprocessing.text.Tokenizer: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json .. _hyperparameters: hyperparameters.html .. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock .. _pipelines: pipelines.html -.. _examples folder: https://github.com/HDI-Project/MLBlocks/tree/master/examples +.. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples .. _fit: ../api_reference.html#mlblocks.MLBlock.fit .. _produce: ../api_reference.html#mlblocks.MLBlock.produce diff --git a/docs/conf.py b/docs/conf.py index 5ff266d0..f81b7b7e 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,7 +63,7 @@ copyright = '2018, MIT Data To AI Lab' author = 'MIT Data To AI Lab' description = 'Pipelines and Primitives for Machine Learning and Data Science.' -user = 'HDI-Project' +user = 'MLBazaar' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst index d2bda921..d64970a2 100644 --- a/docs/getting_started/install.rst +++ b/docs/getting_started/install.rst @@ -30,7 +30,7 @@ is `MLPrimitives`_, which you can install with this command: pip install mlprimitives -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives +.. 
_MLPrimitives: https://github.com/MLBazaar/MLPrimitives Install for development ----------------------- diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index b55223dd..386752dc 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -123,5 +123,5 @@ to obtain predictions from the pipeline. .. _hyperparameters: ../advanced_usage/hyperparameters.html .. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations .. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters -.. _BTB: https://github.com/HDI-Project/BTB +.. _BTB: https://github.com/MLBazaar/BTB .. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters diff --git a/docs/index.rst b/docs/index.rst index e891230c..85717469 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,9 +6,9 @@ What is MLBlocks? :alt: MLBlocks :align: center -* Free software: `MIT license `_ -* Documentation: https://HDI-Project.github.io/MLBlocks -* Homepage: https://github.com/HDI-Project/MLBlocks +* Free software: `MIT license `_ +* Documentation: https://mlbazaar.github.io/MLBlocks +* Homepage: https://github.com/MLBazaar/MLBlocks MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 54ef85a1..8cde5340 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst index e8274761..b9b97ef7 100644 --- a/docs/pipeline_examples/image.rst +++ b/docs/pipeline_examples/image.rst @@ -136,7 +136,7 @@ to an `XGBRegressor`_ primitive. .. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/ .. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur -.. _MLPrimitives primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json +.. _MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json .. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html .. 
_Pretrained Networks from Keras: https://keras.io/applications/ diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst index 109f4015..c2c2066f 100644 --- a/docs/pipeline_examples/multi_table.rst +++ b/docs/pipeline_examples/multi_table.rst @@ -49,5 +49,5 @@ tables are. .. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/ .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn -.. _DeepFeatureSynthesis: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json +.. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json .. _featuretools: https://www.featuretools.com/ diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index 03472ea3..ee0c16ac 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -140,9 +140,9 @@ to encode all the string features, and go directly into the .. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html -.. _TextCleaner primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/text.py -.. _StringVectorizer primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/feature_extraction.py +.. _TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py +.. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py .. _keras text preprocessing: https://keras.io/preprocessing/text/ -.. _Keras LSTM Classifier from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json +.. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json .. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html diff --git a/examples/README.md b/examples/README.md index d295414e..de298ef2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -26,7 +26,7 @@ In order to run these tutorials on your computer, please follow these steps: 1. Clone this github repository: ```bash -git clone git@github.com:HDI-Project/MLBlocks.git +git clone git@github.com:MLBazaar/MLBlocks.git ``` 2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the @@ -45,8 +45,8 @@ cd MLBlocks make install-examples ``` -This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives]( -https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/). +This will install [MLBLocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives]( +https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/). 4. Enter the `examples` folder and start a Jupyter Notebook: diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 5b7944b5..4993fd4e 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. 
Setting MLPipeline Hyperparameters.ipynb
@@ -122,7 +122,7 @@
 "\n",
 "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n",
 "\n",
- "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n",
+ "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n",
 "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed."
 ]
 },
diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
index 4b6eae24..ca30df17 100644
--- a/examples/tutorials/7. Tuning a Pipeline.ipynb
+++ b/examples/tutorials/7. Tuning a Pipeline.ipynb
@@ -6,7 +6,7 @@
 "source": [
 "# Tuning a Pipeline\n",
 "\n",
- "This short guide shows how tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n",
+ "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n",
 "\n",
 "Note that some steps are not explained for simplicity. Full details\n",
 "about them can be found in the previous parts of the tutorial.\n",
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index 1fb4d7ca..829a38d6 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -7,7 +7,7 @@
 "# Selecting and Tuning pipelines\n",
 "\n",
 "This guide shows you how to search for multiple pipelines for your problem\n",
- "and later on use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
+ "and later on use a [BTBSession](https://mlbazaar.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
 "to select and tune the best one.\n",
 "\n",
 "Note that some steps are not explained for simplicity. Full details\n",
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 618e7a55..300b9093 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -7,7 +7,7 @@
 seamlessly combining tools from any python library with a simple, common
 and uniform interface.
 
 * Free software: MIT license
-* Documentation: https://HDI-Project.github.io/MLBlocks
+* Documentation: https://MLBazaar.github.io/MLBlocks
 """
 
 from mlblocks.discovery import (
diff --git a/setup.py b/setup.py
index 6ae0c75e..0c67cc8d 100644
--- a/setup.py
+++ b/setup.py
@@ -112,7 +112,7 @@
     setup_requires=setup_requires,
     test_suite='tests',
     tests_require=tests_require,
-    url='https://github.com/HDI-Project/MLBlocks',
+    url='https://github.com/MLBazaar/MLBlocks',
     version='0.3.5.dev0',
     zip_safe=False,
 )

From b9a6142e77b50eae9ae1a3aad6eae8dc1e1f6e70 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 13 Dec 2020 18:46:15 +0100
Subject: [PATCH 122/160] Prevent travis-ci conflict

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0c67cc8d..4ff3a675 100644
--- a/setup.py
+++ b/setup.py
@@ -77,7 +77,10 @@
 
     # Documentation style
     'doc8>=0.8.0',
-    'pydocstyle>=3.0.0'
+    'pydocstyle>=3.0.0',
+
+    # Prevent travis-ci conflict
+    'chardet<4',
 ]
 

From 52653e072a17986da77c666fc5f2a73895f4b40b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 14 Dec 2020 14:03:35 +0100
Subject: [PATCH 123/160] Update Travis badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 770f34ef..103fc113 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 
 [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks)
-[![Travis](https://travis-ci.org/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.org/MLBazaar/MLBlocks)
+[![Travis](https://travis-ci.com/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.com/MLBazaar/MLBlocks)
 [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks)
 [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks)
 

From c5f3fdfc3de21fffe0053c00fd7d6279243126a9 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com>
Date: Tue, 22 Dec 2020 15:23:57 +0100
Subject: [PATCH 124/160] Add memory debug and profile (#130)

* Add code for memory consumption and optionally select which debug you
  would like to use.
* Add documentation about debugging
* Add psutil
* Tests updates
* Fix lint
* Add extra tests
* Update MLPrimitives version
* Rephrase documentation
---
 docs/advanced_usage/pipelines.rst |  36 ++++++
 mlblocks/mlpipeline.py            | 189 ++++++++++++++++++------------
 setup.py                          |   5 +-
 tests/test_mlpipeline.py          | 184 ++++++++++++++++++++++-------
 4 files changed, 294 insertions(+), 120 deletions(-)

diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index e87a0067..07b36c98 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -423,6 +423,42 @@ An example of this situation, where we want to reuse the output of the first blo
 
     predictions = pipeline.predict(X_test)
     score = compute_score(y_test, predictions)
 
+Pipeline debugging
+------------------
+
+Sometimes we might be interested in debugging a pipeline execution and obtaining information
+about the time, the memory usage, and the inputs and outputs that each step takes. This is
+possible by using the argument ``debug`` with the methods ``fit`` and ``predict``. This argument
+allows us
+to retrieve critical information from the pipeline execution:
+
+* ``Time``: Elapsed time for the primitive and the given stage (fit or predict).
+* ``Memory``: Amount of memory increase or decrease for the given primitive for that pipeline.
+* ``Input``: The input values that the primitive takes for that specific step.
+* ``Output``: The output produced by the primitive.
+
+
+If the ``debug`` argument is set to ``True`` then a dictionary will be returned containing all the
+elements listed previously::
+
+    result, debug_info = pipeline.fit(X_train, y_train, debug=True)
+
+In case you want to retrieve only some of the elements listed above and skip the rest, you can
+pass an ``str`` to the ``debug`` argument with any combination of the following characters:
+
+* ``i``: To include inputs.
+* ``o``: To include outputs.
+* ``m``: To include used memory.
+* ``t``: To include elapsed time.
+
+For example, if we are only interested in capturing the elapsed time and used memory during the
+``fit`` process, we can call the method as follows::
+
+    result, debug_info = pipeline.fit(X_train, y_train, debug='tm')
+
+.. warning:: Bear in mind that if we use ``debug=True`` or save the ``Input`` and ``Output``,
+             this will consume extra RAM, as it will create copies of the input data and
+             the output data for each primitive. For profiling, it is recommended to use
+             the option ``tm`` as shown in the previous example.
 
 .. _API Reference: ../api_reference.html
 .. _primitives: ../primitives.html
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 6e0744bd..a4111bcb 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,6 +4,7 @@
 
 import json
 import logging
+import os
 import re
 import warnings
 from collections import Counter, OrderedDict, defaultdict
@@ -11,6 +12,7 @@
 from datetime import datetime
 
 import numpy as np
+import psutil
 from graphviz import Digraph
 
 from mlblocks.discovery import load_pipeline
@@ -110,14 +112,14 @@ def _build_blocks(self):
             if not block_params:
                 block_params = self.init_params.get(primitive_name, dict())
                 if block_params and block_count > 1:
-                    LOGGER.warning(("Non-numbered init_params are being used "
-                                    "for more than one block %s."), primitive_name)
+                    LOGGER.warning(('Non-numbered init_params are being used '
+                                    'for more than one block %s.'), primitive_name)
 
                 block = MLBlock(primitive, **block_params)
                 blocks[block_name] = block
 
             except Exception:
-                LOGGER.exception("Exception caught building MLBlock %s", primitive)
+                LOGGER.exception('Exception caught building MLBlock %s', primitive)
                 raise
 
         return blocks
@@ -475,8 +477,8 @@ def _sanitize(cls, hyperparameters):
         is a dict containing a complete hyperparameter specification for that block::
 
             {
-                "block_name": {
-                    "hyperparameter_name": "hyperparameter_value",
+                'block_name': {
+                    'hyperparameter_name': 'hyperparameter_value',
                     ...
                 },
                 ...
@@ -487,7 +489,7 @@
         second one::
 
             {
-                ("block_name", "hyperparameter_name"): "hyperparameter_value",
+                ('block_name', 'hyperparameter_name'): 'hyperparameter_value',
                 ...
} @@ -611,39 +613,52 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context, debug=None): + def _fit_block(self, block, block_name, context, debug_info=None): """Get the block args from the context and fit the block.""" - LOGGER.debug("Fitting block %s", block_name) + LOGGER.debug('Fitting block %s', block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss start = datetime.utcnow() block.fit(**fit_args) elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss - if debug is not None: - debug["fit"][block_name] = { - "elapsed": elapsed.total_seconds(), - "input": fit_args - } + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(fit_args) + + debug_info['fit'][block_name] = record except Exception: if self.verbose: - LOGGER.exception("Exception caught fitting MLBlock %s", block_name) + LOGGER.exception('Exception caught fitting MLBlock %s', block_name) raise - def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None): + def _produce_block(self, block, block_name, context, output_variables, + outputs, debug_info=None): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update the outputs list if necessary. """ - LOGGER.debug("Producing block %s", block_name) + LOGGER.debug('Producing block %s', block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss start = datetime.utcnow() block_outputs = block.produce(**produce_args) elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) context.update(outputs_dict) @@ -656,21 +671,23 @@ def _produce_block(self, block, block_name, context, output_variables, outputs, variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) - if debug is not None: - record = { - "elapsed": elapsed.total_seconds(), - "input": produce_args, - "output": outputs_dict - } + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(produce_args) + if 'o' in debug: + record['output'] = deepcopy(outputs_dict) - if "fit" in debug.keys(): - debug["produce"][block_name] = record - else: - debug[block_name] = record + debug_info['produce'][block_name] = record except Exception: if self.verbose: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) + LOGGER.exception('Exception caught producing MLBlock %s', block_name) raise @@ -692,21 +709,31 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): y: Fit Data labels, which the pipeline will use to learn how to behave. - output_ (str or int or list or None): Output specification, as required by ``get_outputs``. 
If ``None`` is given, nothing will be returned. - start_ (str or int or None): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - - debug (boolean): - Debug mode, if True a dictionary containing the block names as keys and - the execution time in seconds, input, output as values is returned. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. + + If provided, return a dictionary with the ``fit`` and ``predict`` performance. + This argument can be a string containing a combination of the letters listed above, + or ``True`` which will return a complete debug. **kwargs: Any additional keyword arguments will be directly added @@ -738,13 +765,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): debug_info = None if debug: debug_info = defaultdict(dict) + debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' for block_name, block in self.blocks.items(): if start_: if block_name == start_: start_ = False else: - LOGGER.debug("Skipping block %s fit", block_name) + LOGGER.debug('Skipping block %s fit', block_name) continue self._fit_block(block, block_name, context, debug_info) @@ -770,13 +798,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): return result - if debug: - return debug_info - if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) + if debug: + return debug_info + def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs): """Produce predictions using the blocks of this pipeline. @@ -791,21 +819,31 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) Args: X: Data which the pipeline will use to make predictions. - output_ (str or int or list or None): Output specification, as required by ``get_outputs``. If not specified the ``default`` output will be returned. - start_ (str or int or None): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - - debug (boolean): - Debug mode, if True a dictionary containing the block names as keys and - the execution time in seconds, input, output as values is returned. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. + + If ``True`` then a dictionary will be returned containing all the elements listed + previously. 
If a ``string`` value combining any of the letters listed above is given,
+                the returned dictionary will contain only the selected elements.

             **kwargs:
                 Any additional keyword arguments will be directly added

@@ -815,6 +853,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
             object or tuple:
                 * If a single output is requested, it is returned alone.
                 * If multiple outputs have been requested, a tuple is returned.
+                * If ``debug`` is given, a tuple will be returned where the first element
+                  is the predictions and the second is a dictionary containing the debug
+                  information.
         """
         context = kwargs.copy()
         if X is not None:

@@ -827,14 +868,15 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)

         debug_info = None
         if debug:
-            debug_info = dict()
+            debug_info = defaultdict(dict)
+            debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio'

         for block_name, block in self.blocks.items():
             if start_:
                 if block_name == start_:
                     start_ = False
                 else:
-                    LOGGER.debug("Skipping block %s produce", block_name)
+                    LOGGER.debug('Skipping block %s produce', block_name)
                     continue

             self._produce_block(block, block_name, context, output_variables, outputs, debug_info)

@@ -856,9 +898,6 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)

             return result

-        if debug:
-            return debug_info
-
         if start_:
             # We skipped all the blocks up to the end
             raise ValueError('Unknown block name: {}'.format(start_))

@@ -871,32 +910,32 @@ def to_dict(self):
        specification of the tunable_hyperparameters::

            {
-               "primitives": [
-                   "a_primitive",
-                   "another_primitive"
+               'primitives': [
+                   'a_primitive',
+                   'another_primitive'
                ],
-               "init_params": {
-                   "a_primitive": {
-                       "an_argument": "a_value"
+               'init_params': {
+                   'a_primitive': {
+                       'an_argument': 'a_value'
                    }
                },
-               "hyperparameters": {
-                   "a_primitive#1": {
-                       "an_argument": "a_value",
-                       "another_argument": "another_value",
+               'hyperparameters': {
+                   'a_primitive#1': {
+                       'an_argument': 'a_value',
+                       'another_argument': 'another_value',
                    },
-                   "another_primitive#1": {
-                       "yet_another_argument": "yet_another_value"
+                   'another_primitive#1': {
+                       'yet_another_argument': 'yet_another_value'
                    }
                },
-               "tunable_hyperparameters": {
-                   "another_primitive#1": {
-                       "yet_another_argument": {
-                           "type": "str",
-                           "default": "a_default_value",
-                           "values": [
-                               "a_default_value",
-                               "yet_another_value"
+               'tunable_hyperparameters': {
+                   'another_primitive#1': {
+                       'yet_another_argument': {
+                           'type': 'str',
+                           'default': 'a_default_value',
+                           'values': [
+                               'a_default_value',
+                               'yet_another_value'
                            ]
                        }
                    }

@@ -926,8 +965,8 @@ def _get_simple_block_name(self, block_name):
             str:
                 block name stripped of number and other modifiers.
         """
-        full_name = block_name.split("#")[0]
-        simple_name = full_name.split(".")[-1]
+        full_name = block_name.split('#')[0]
+        simple_name = full_name.split('.')[-1]
         return simple_name

     def _get_context_name_from_variable(self, variable_name):

         Returns:
             str:
                 Name of the context of the variable.
""" - block_name = variable_name.split("#")[0] + block_name = variable_name.split('#')[0] rest = variable_name[len(block_name) + 1:] - block_index = rest.split(".")[0] + block_index = rest.split('.')[0] context_name = rest[len(block_index) + 1:] if len(context_name) == 0: - raise ValueError("Invalid variable name") + raise ValueError('Invalid variable name') return context_name def _get_relevant_output_variables(self, block_name, block, current_output_variables): @@ -1107,7 +1146,7 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): Dictionary of input variables of the pipeline and the set of tuples of blocks into which the variable connects and the type of arrowhead to use """ - with diagram.subgraph(name="cluster_inputs") as cluster: + with diagram.subgraph(name='cluster_inputs') as cluster: cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') @@ -1148,7 +1187,7 @@ def _make_diagram_outputs(self, diagram, outputs): output_variables = [] outputs_vars = self.get_outputs(outputs) - with diagram.subgraph(name="cluster_outputs") as cluster: + with diagram.subgraph(name='cluster_outputs') as cluster: cluster.attr(tooltip='Output variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') diff --git a/setup.py b/setup.py index 4ff3a675..d76236ae 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,13 @@ install_requires = [ 'graphviz>=0.9,<1', 'numpy>=1.17.1,<1.19', + 'psutil>=5,<6', ] examples_require = [ 'matplotlib>=2.2.2,<3.2.2', - 'mlprimitives>=0.2.6.dev0,<0.3', + 'mlprimitives>=0.3.0.dev0,<0.4', 'boto3>=1.14,<1.14.45', 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', @@ -34,7 +35,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2.6.dev0,<0.3', + 'mlprimitives>=0.3.0.dev0,<0.4', 'setuptools>=41.0.0', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 59e11633..97c59cd0 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -696,7 +696,7 @@ def test_fit_no_debug(self): assert returned is None @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test_fit_debug(self): + def test_fit_debug_bool(self): mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks['a_primitive#1'].fit_args = [ { @@ -706,24 +706,53 @@ def test_fit_debug(self): ] expected_return = dict() - expected_return["fit"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" - } + expected_return['debug'] = 'tmio' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' + }, + 'memory': 0, } } returned = mlpipeline.fit(debug=True) - print(returned) assert isinstance(returned, dict) assert set(returned.keys()) == set(expected_return.keys()) # fit / produce - assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name + assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name + + for block_name, dictionary in expected_return['fit'].items(): + assert set(returned['fit'][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_debug_str(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + expected_return = dict() + expected_return['debug'] = 'tm' + 
expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + + returned = mlpipeline.fit(debug='tm') + + assert isinstance(returned, dict) + assert set(returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name - for block_name, dictionary in expected_return["fit"].items(): - assert set(returned["fit"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['fit'].items(): + assert set(returned['fit'][block_name].keys()) == set(dictionary.keys()) @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_produce_debug(self): @@ -759,39 +788,104 @@ def test_fit_produce_debug(self): ] expected_return = dict() - expected_return["fit"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" - } + expected_return['debug'] = 'tmio' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' + }, + 'memory': 0, } } - expected_return["produce"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" + expected_return['produce'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' }, - "output": { - "whatever" - } + 'output': { + 'whatever' + }, + 'memory': 0, } } returned, debug_returned = mlpipeline.fit(output_='default', debug=True) - assert len([returned]) == len(outputs["default"]) + assert len([returned]) == len(outputs['default']) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys()) + assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys()) + + for block_name, dictionary in expected_return['fit'].items(): + assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys()) + + for block_name, dictionary in expected_return['produce'].items(): + assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_produce_debug_str(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return['debug'] = 'tm' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + expected_return['produce'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + + returned, debug_returned = mlpipeline.fit(output_='default', debug='tm') + + assert len([returned]) == len(outputs['default']) assert isinstance(debug_returned, dict) assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce - assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys()) - assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys()) + assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys()) + assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys()) - for block_name, dictionary in 
expected_return["fit"].items(): - assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['fit'].items(): + assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys()) - for block_name, dictionary in expected_return["produce"].items(): - assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['produce'].items(): + assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys()) @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_predict_no_debug(self): @@ -829,9 +923,9 @@ def test_predict_no_debug(self): ] returned = mlpipeline.predict(debug=False) - assert len(returned) == len(outputs["default"]) - for returned_output, expected_output in zip(returned, outputs["default"]): - assert returned_output == expected_output["variable"] + assert len(returned) == len(outputs['default']) + for returned_output, expected_output in zip(returned, outputs['default']): + assert returned_output == expected_output['variable'] @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_predict_debug(self): @@ -861,18 +955,22 @@ def test_predict_debug(self): expected_return = dict() expected_return = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' }, - "output": { - "whatever" - } + 'output': { + 'whatever' + }, + 'memory': 0 } } + returned, debug_returned = mlpipeline.predict(debug=True) - assert len([returned]) == len(outputs["default"]) + debug_returned = debug_returned['produce'] + + assert len([returned]) == len(outputs['default']) assert isinstance(debug_returned, dict) assert set(debug_returned.keys()) == set(expected_return.keys()) From 9f9c9a14f22e7d2f52e992562a5cc189c0ed12c8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 22 Dec 2020 16:00:33 +0100 Subject: [PATCH 125/160] =?UTF-8?q?Bump=20version:=200.3.5.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 300b9093..08618880 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.5.dev0' +__version__ = '0.4.0.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 61208b1f..32db4562 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.5.dev0 +current_version = 0.4.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index d76236ae..a929025f 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.3.5.dev0', + version='0.4.0.dev0', zip_safe=False, ) From 1af7b1bbc617beaab80f453eec01a145e8685032 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 22 Dec 2020 16:00:48 +0100 Subject: [PATCH 126/160] =?UTF-8?q?Bump=20version:=200.4.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 08618880..e3d6fada 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0.dev0' +__version__ = '0.4.0.dev1' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 32db4562..17998d88 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0.dev0 +current_version = 0.4.0.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index a929025f..0eab74aa 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0.dev0', + version='0.4.0.dev1', zip_safe=False, ) From 84460489fc0a0fb2c762f3f16baf4c3e09d5056a Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Fri, 8 Jan 2021 10:16:22 -0500 Subject: [PATCH 127/160] Stop fitting pipeline after last fit block (#132) * initial early stop * change to stop after fitting the last block with attribute * test early-stop calls * remove comment * change to fit pending --- mlblocks/mlpipeline.py | 35 ++++++++++++++++++++--------- tests/test_mlpipeline.py | 48 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index a4111bcb..d7935757 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -96,6 +96,7 @@ def _get_tunable_hyperparameters(self): def _build_blocks(self): blocks = OrderedDict() + last_fit_block = None block_names_count = Counter() for primitive in self.primitives: @@ -118,11 +119,14 @@ def _build_blocks(self): block = MLBlock(primitive, **block_params) blocks[block_name] = block + if bool(block._fit): + last_fit_block = block_name + except Exception: LOGGER.exception('Exception caught building MLBlock %s', primitive) raise - return blocks + return blocks, last_fit_block @staticmethod def _get_pipeline_dict(pipeline, primitives): @@ -207,7 +211,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) - self.blocks = self._build_blocks() + self.blocks, self._last_fit_block = self._build_blocks() self._last_block_name = self._get_block_name(-1) self.input_names = input_names or pipeline.get('input_names', dict()) @@ -767,7 +771,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): debug_info = defaultdict(dict) 
debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' + fit_pending = True for block_name, block in self.blocks.items(): + if block_name == self._last_fit_block: + fit_pending = False + if start_: if block_name == start_: start_ = False @@ -777,7 +785,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): self._fit_block(block, block_name, context, debug_info) - if (block_name != self._last_block_name) or (block_name in output_blocks): + if fit_pending or output_blocks: self._produce_block( block, block_name, context, output_variables, outputs, debug_info) @@ -787,16 +795,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): # If there was an output_ but there are no pending # outputs we are done. - if output_variables is not None and not output_blocks: - if len(outputs) > 1: - result = tuple(outputs) - else: - result = outputs[0] + if output_variables: + if not output_blocks: + if len(outputs) > 1: + result = tuple(outputs) + else: + result = outputs[0] + + if debug: + return result, debug_info + + return result + elif not fit_pending: if debug: - return result, debug_info + return debug_info - return result + return if start_: # We skipped all the blocks up to the end diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 97c59cd0..0ee4cf2c 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,54 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_pending_all_primitives(self): + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + + self_ = MagicMock(autospec=MLPipeline) + self_.blocks = blocks + self_._last_fit_block = 'a.primitive.Name#2' + + MLPipeline.fit(self_) + + expected = [ + call('a.primitive.Name#1'), + call('a.primitive.Name#2') + ] + self_._fit_block.call_args_list = expected + + expected = [ + call('a.primitive.Name#1'), + ] + self_._produce_block.call_args_list = expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_pending_one_primitive(self): + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + + self_ = MagicMock(autospec=MLPipeline) + self_.blocks = blocks + self_._last_fit_block = 'a.primitive.Name#1' + + MLPipeline.fit(self_) + + expected = [ + call('a.primitive.Name#1'), + ] + self_._fit_block.call_args_list = expected + + assert not self_._produce_block.called + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_no_debug(self): mlpipeline = MLPipeline(['a_primitive']) From 4c2a473c505524e10a850952961a66f35fa41b95 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 16:49:06 +0100 Subject: [PATCH 128/160] Migrate to gh actions (#133) * Cleanup dependencies and migrate to gh-actions * add mlprimitives extra --- .github/workflows/docs.yml | 29 +++++++ .github/workflows/tests.yml | 79 +++++++++++++++++++ .travis.yml | 30 ------- Makefile | 45 +++++++++-- README.md | 21 +++-- apt.txt | 3 + docs/api/mlblocks.datasets.rst | 5 -- docs/api/mlblocks.discovery.rst | 5 -- docs/index.rst | 4 +- docs/pipeline_examples/graph.rst | 2 +- docs/pipeline_examples/image.rst | 6 +- docs/pipeline_examples/multi_table.rst | 2 +- docs/pipeline_examples/single_table.rst | 6 +- 
docs/pipeline_examples/text.rst | 4 +- ...or the best pipeline with BTBSession.ipynb | 2 +- mlblocks/__init__.py | 16 +++- requirements.txt | 2 + setup.cfg | 1 - setup.py | 23 +++--- tox.ini | 2 + 20 files changed, 197 insertions(+), 90 deletions(-) create mode 100644 .github/workflows/docs.yml create mode 100644 .github/workflows/tests.yml delete mode 100644 .travis.yml create mode 100644 apt.txt delete mode 100644 docs/api/mlblocks.datasets.rst delete mode 100644 docs/api/mlblocks.discovery.rst create mode 100644 requirements.txt diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..7093b531 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Generate Docs + +on: + push: + branches: [ stable ] + +jobs: + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Python + uses: actions/setup-python@v1 + with: + python-version: 3.8 + + - name: Build + run: | + sudo apt-get install graphviz pandoc + python -m pip install --upgrade pip + pip install -e .[dev] + make docs + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{secrets.GITHUB_TOKEN}} + publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..ea2c37f5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,79 @@ +name: Run Tests + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ master ] + +jobs: + devel: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package + run: pip install .[dev] + - name: make test-devel + run: make test-devel + + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install rundoc .[mlprimitives] + - name: make test-readme + run: make test-readme + + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install .[test] + - name: make test-unit + run: make test-unit + + tutorials: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'ubuntu-latest' + name: Install dependencies - Ubuntu + run: sudo apt-get install graphviz + - name: Install package and dependencies + run: pip install .[examples] + - name: make test-tutorials + run: make test-tutorials diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d2a982f2..00000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Config file for automatic testing at travis-ci.org -dist: bionic -language: python -python: - - 3.8 - - 3.7 - - 
3.6 - -# Command to install dependencies -install: - - sudo apt-get update - - sudo apt-get install graphviz pandoc - - pip install -U tox-travis codecov - -# Command to run tests -script: travis_wait 60 tox - -after_success: codecov - -deploy: - - - provider: pages - skip-cleanup: true - github-token: "$GITHUB_TOKEN" - keep-history: true - local-dir: docs/_build/html - target-branch: gh-pages - on: - branch: master - python: 3.8 diff --git a/Makefile b/Makefile index 6cc80705..c28da455 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,12 @@ install-test: clean-build clean-pyc ## install the package and test dependencies install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development pip install -e .[dev] +MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') + +.PHONY: install-minimum +install-minimum: ## install the minimum supported versions of the package dependencies + pip install $(MINIMUM) + # LINT TARGETS @@ -123,7 +129,7 @@ test-readme: ## run the readme snippets .PHONY: test-tutorials test-tutorials: ## run the tutorial notebooks find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ - jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \; + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + .PHONY: test test: test-unit test-readme ## test everything that needs test dependencies @@ -154,11 +160,11 @@ docs: clean-docs ## generate Sphinx HTML documentation, including API docs $(MAKE) -C docs html .PHONY: view-docs -view-docs: docs ## view docs in browser +view-docs: ## view the docs in a browser $(BROWSER) docs/_build/html/index.html .PHONY: serve-docs -serve-docs: view-docs ## compile the docs watching for changes +serve-docs: ## compile the docs watching for changes watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs @@ -170,12 +176,19 @@ dist: clean ## builds source and wheel package python setup.py bdist_wheel ls -l dist -.PHONY: test-publish -test-publish: dist ## package and upload a release on TestPyPI +.PHONY: publish-confirm +publish-confirm: + @echo "WARNING: This will irreversibly upload a new version to PyPI!" + @echo -n "Please type 'confirm' to proceed: " \ + && read answer \ + && [ "$${answer}" = "confirm" ] + +.PHONY: publish-test +publish-test: dist publish-confirm ## package and upload a release on TestPyPI twine upload --repository-url https://test.pypi.org/legacy/ dist/* .PHONY: publish -publish: dist ## package and upload a release +publish: dist publish-confirm ## package and upload a release twine upload dist/* .PHONY: bumpversion-release @@ -204,9 +217,21 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major +.PHONY: bumpversion-revert +bumpversion-revert: ## Undo a previous bumpversion-release + git checkout master + git branch -D stable + +CLEAN_DIR := $(shell git status --short | grep -v ??) 
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) +.PHONY: check-clean +check-clean: ## Check if the directory has uncommitted changes +ifneq ($(CLEAN_DIR),) + $(error There are uncommitted changes) +endif + .PHONY: check-master check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) @@ -220,15 +245,21 @@ ifeq ($(CHANGELOG_LINES),0) endif .PHONY: check-release -check-release: check-master check-history ## Check if the release can be made +check-release: check-clean check-master check-history ## Check if the release can be made @echo "A new release can be made" .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-test +release-test: check-release bumpversion-release-test publish-test bumpversion-revert + .PHONY: release-candidate release-candidate: check-master publish bumpversion-candidate +.PHONY: release-candidate-test +release-candidate-test: check-clean check-master publish-test + .PHONY: release-minor release-minor: check-release bumpversion-minor release diff --git a/README.md b/README.md index 103fc113..4da013b0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@

-“DAI-Lab”
-An open source project from Data to AI Lab at MIT.
+
+DAI-Lab
+
+An Open Source Project from the Data to AI Lab, at MIT

@@ -13,18 +15,19 @@ Pipelines and Primitives for Machine Learning and Data Science. [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) -[![Travis](https://travis-ci.com/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.com/MLBazaar/MLBlocks) +[![Tests](https://github.com/MLBazaar/MLBlocks/workflows/Run%20Tests/badge.svg)](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials)
# MLBlocks -* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) -* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) * Documentation: https://mlbazaar.github.io/MLBlocks -* Homepage: https://github.com/MLBazaar/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) +* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) ## Overview @@ -49,11 +52,7 @@ Features include: **MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) -Also, although it is not strictly required, the usage of a -[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid -interfering with other software installed in the system where **MLBlocks** is run. - -## Install with pip +## Install with `pip` The easiest and recommended way to install **MLBlocks** is using [pip]( https://pip.pypa.io/en/stable/): diff --git a/apt.txt b/apt.txt new file mode 100644 index 00000000..65387721 --- /dev/null +++ b/apt.txt @@ -0,0 +1,3 @@ +# apt-get requirements for development and mybinder environment +graphviz +pandoc diff --git a/docs/api/mlblocks.datasets.rst b/docs/api/mlblocks.datasets.rst deleted file mode 100644 index 6661cd8a..00000000 --- a/docs/api/mlblocks.datasets.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.datasets -================= - -.. automodule:: mlblocks.datasets - :members: diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst deleted file mode 100644 index c9109130..00000000 --- a/docs/api/mlblocks.discovery.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.discovery -================== - -.. automodule:: mlblocks.discovery - :members: diff --git a/docs/index.rst b/docs/index.rst index 85717469..25567005 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,9 +6,9 @@ What is MLBlocks? :alt: MLBlocks :align: center -* Free software: `MIT license `_ * Documentation: https://mlbazaar.github.io/MLBlocks -* Homepage: https://github.com/MLBazaar/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: `MIT `_ MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 8cde5340..082d12b6 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -30,7 +30,7 @@ additional information not found inside `X`. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_umls + from mlprimitives.datasets import load_umls dataset = load_umls() dataset.describe() diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst index b9b97ef7..e892f915 100644 --- a/docs/pipeline_examples/image.rst +++ b/docs/pipeline_examples/image.rst @@ -24,7 +24,7 @@ Gradients using the corresponding `scikit-image function`_ to later on use a sim .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -61,7 +61,7 @@ and directly after go into a Single Layer CNN Classifier built on Keras using th .. 
code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -107,7 +107,7 @@ to an `XGBRegressor`_ primitive. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_handgeometry + from mlprimitives.datasets import load_handgeometry dataset = load_handgeometry() dataset.describe() diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst index c2c2066f..7091a374 100644 --- a/docs/pipeline_examples/multi_table.rst +++ b/docs/pipeline_examples/multi_table.rst @@ -25,7 +25,7 @@ tables are. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_wikiqa + from mlprimitives.datasets import load_wikiqa dataset = load_wikiqa() dataset.describe() diff --git a/docs/pipeline_examples/single_table.rst b/docs/pipeline_examples/single_table.rst index ee00d9c6..6a031cb1 100644 --- a/docs/pipeline_examples/single_table.rst +++ b/docs/pipeline_examples/single_table.rst @@ -5,7 +5,7 @@ In this section we will go over a few pipeline examples to show **MLBlocks** wor in different scenarios and with different types of data. For each example, we will be using example datasets which can be downloaded using the -various functions found in the ``mlblocks.datasets`` module. +various functions found in the ``mlprimitives.datasets`` module. .. note:: Even though the datasets are not especially big, some of the examples might use a considerable amount of resources, especially memory, and might take @@ -21,7 +21,7 @@ the numeric data from `The Boston Dataset`_, which we will load using the .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_boston + from mlprimitives.datasets import load_boston dataset = load_boston() dataset.describe() @@ -52,7 +52,7 @@ In this case, we will also be passing some initialization parameters for the XGB .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_iris + from mlprimitives.datasets import load_iris dataset = load_iris() dataset.describe() diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index ee0c16ac..75ca3f39 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -28,7 +28,7 @@ for later ones. import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_newsgroups + from mlprimitives.datasets import load_newsgroups dataset = load_newsgroups() dataset.describe() @@ -105,7 +105,7 @@ to encode all the string features, and go directly into the import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_personae + from mlprimitives.datasets import load_personae dataset = load_personae() dataset.describe() diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 829a38d6..44431d4f 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -616,7 +616,7 @@ } ], "source": [ - "session.run(20)" + "session.run(10)" ] }, { diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index e3d6fada..8e4e6537 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -11,8 +11,8 @@ """ from mlblocks.discovery import ( - add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths, - load_pipeline, load_primitive) + add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths, + get_primitives_paths, load_pipeline, load_primitive) from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline @@ -23,6 +23,14 @@ __version__ = '0.4.0.dev1' __all__ = [ - 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', - 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive' + 'MLBlock', + 'MLPipeline', + 'add_pipelines_path', + 'add_primitives_path', + 'find_pipelines', + 'find_primitives', + 'get_pipelines_paths', + 'get_primitives_paths', + 'load_pipeline', + 'load_primitive' ] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..3b01f6bf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +# Requirements for development and mybinder environment +-e .[dev] diff --git a/setup.cfg b/setup.cfg index 17998d88..969e1d64 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,6 @@ exclude = .tox, .git, __pycache__, .ipynb_checkpoints ignore = # Keep empty to prevent default ignores [isort] -include_trailing_comment = True line_length = 99 lines_between_types = 0 multi_line_output = 4 diff --git a/setup.py b/setup.py index 0eab74aa..c9068f3a 100644 --- a/setup.py +++ b/setup.py @@ -5,12 +5,10 @@ from setuptools import find_packages, setup - -with open('README.md') as readme_file: +with open('README.md', encoding='utf-8') as readme_file: readme = readme_file.read() - -with open('HISTORY.md') as history_file: +with open('HISTORY.md', encoding='utf-8') as history_file: history = history_file.read() @@ -21,13 +19,12 @@ ] -examples_require = [ - 'matplotlib>=2.2.2,<3.2.2', +mlprimitives_requires = [ 'mlprimitives>=0.3.0.dev0,<0.4', - 'boto3>=1.14,<1.14.45', - 'botocore<1.17.45,>=1.17.44', +] + +examples_require = mlprimitives_requires + [ 'jupyter==1.0.0', - 'docutils<0.16,>=0.10', 'baytune>=0.3.13.dev0,<0.4', ] @@ -79,9 +76,6 @@ # Documentation style 'doc8>=0.8.0', 'pydocstyle>=3.0.0', - - # Prevent travis-ci conflict - 'chardet<4', ] @@ -98,16 +92,17 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - description="Pipelines and primitives for machine learning and data science.", + description='Pipelines and primitives for machine learning and data science.', extras_require={ 'dev': development_requires + tests_require + examples_require, 'test': tests_require + examples_require, 'examples': examples_require, + 'mlprimitives': mlprimitives_requires, }, include_package_data=True, install_requires=install_requires, keywords='auto machine learning classification regression data science pipeline', - license="MIT license", + license='MIT license', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', name='mlblocks', diff --git a/tox.ini b/tox.ini index 1bc3f81a..229c1d54 100644 --- a/tox.ini +++ b/tox.ini @@ -14,8 +14,10 @@ skip_install = false extras = test commands = /usr/bin/env make test + rm -r {envdir} [testenv:test-devel] extras = dev commands = /usr/bin/env make test-devel + rm 
-r {envdir} From 6dbcda49319047b1dcf339f5c00b830b61a8ed29 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:19:37 +0100 Subject: [PATCH 129/160] Fix dependency conflicts --- requirements.txt | 1 + setup.py | 6 ++++-- tox.ini | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3b01f6bf..d2ce3888 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ # Requirements for development and mybinder environment -e .[dev] +docutils<0.16,>=0.10 # Fix dependency conflict on mybinder diff --git a/setup.py b/setup.py index c9068f3a..91edced6 100644 --- a/setup.py +++ b/setup.py @@ -20,12 +20,14 @@ mlprimitives_requires = [ - 'mlprimitives>=0.3.0.dev0,<0.4', + 'mlprimitives>=0.3.0,<0.4', + 'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict + 'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3 ] examples_require = mlprimitives_requires + [ 'jupyter==1.0.0', - 'baytune>=0.3.13.dev0,<0.4', + 'baytune>=0.4.0,<0.5', ] diff --git a/tox.ini b/tox.ini index 229c1d54..e38f071b 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ python = [testenv] passenv = CI TRAVIS TRAVIS_* +allowlist_externals = rm skipsdist = false skip_install = false extras = test From 2c1e9a3f83bcb937a630b440dbbcef83db4eff4d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:20:33 +0100 Subject: [PATCH 130/160] Add release notes for v0.4.0 --- HISTORY.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 17bbda92..da082c25 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,17 @@ Changelog ========= +0.4.0 - 2021-01-09 +------------------ + +* Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish +* Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer +* Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala +* Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish +* Allow loading a primitive or pipeline directly from the JSON path - [Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala +* Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu +* Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu + 0.3.4 - 2019-11-01 ------------------ From 04bb5fc72f55a9e2f439bed2d4ec3ae6537f52f4 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:22:42 +0100 Subject: [PATCH 131/160] =?UTF-8?q?Bump=20version:=200.4.0.dev1=20?= =?UTF-8?q?=E2=86=92=200.4.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 8e4e6537..28a80c5d 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0.dev1' +__version__ = '0.4.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 969e1d64..dc027074 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0.dev1 +current_version = 0.4.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
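Stepping back to the early-stopping change from #132 above: once the last block that implements ``fit`` has been fitted, ``MLPipeline.fit`` no longer runs the remaining ``produce`` steps unless an output was requested. A minimal sketch of the resulting behavior, with illustrative primitive names and placeholder data (``X_train`` and ``y_train`` are assumed to already exist):

.. code-block:: python

    from mlblocks import MLPipeline

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.ensemble.RandomForestClassifier',
    ])

    # The scaler is fitted and then produced, because its output still feeds
    # the classifier. The classifier is the pipeline's last fit block, so
    # after it is fitted nothing is pending and its produce step is skipped.
    pipeline.fit(X_train, y_train)

    # Requesting an output keeps the produce steps that are needed to build it:
    y_pred = pipeline.fit(X_train, y_train, output_='default')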
diff --git a/setup.py b/setup.py index 91edced6..0c05c20b 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0.dev1', + version='0.4.0', zip_safe=False, ) From ae9653bfd0ae3e9798071d8bec311cee4e396804 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:22:55 +0100 Subject: [PATCH 132/160] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200?= =?UTF-8?q?.4.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 28a80c5d..61438750 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0' +__version__ = '0.4.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index dc027074..96b72ce1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 0c05c20b..db8f5aa6 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0', + version='0.4.1.dev0', zip_safe=False, ) From 098302e83d17d05425bf546077805738abeaebc7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Mon, 8 Mar 2021 16:27:05 +0100 Subject: [PATCH 133/160] Implement dynamic inputs and outputs. (#135) * Implement dynamic inputs and outputs. * Recover block_outputs if it's a string from the block's instance. * Update tests --- mlblocks/mlblock.py | 13 +++- mlblocks/mlpipeline.py | 16 +++++ tests/test_mlpipeline.py | 142 ++++++++++++++++++++++++++++++--------- 3 files changed, 136 insertions(+), 35 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index f570165b..d2295722 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -111,8 +111,15 @@ def _extract_params(self, kwargs, hyperparameters): if name in kwargs: init_params[name] = kwargs.pop(name) - fit_args = [arg['name'] for arg in self.fit_args] - produce_args = [arg['name'] for arg in self.produce_args] + if not isinstance(self.fit_args, str): + fit_args = [arg['name'] for arg in self.fit_args] + else: + fit_args = [] + + if not isinstance(self.produce_args, str): + produce_args = [arg['name'] for arg in self.produce_args] + else: + produce_args = [] for name in list(kwargs.keys()): if name in fit_args: @@ -257,6 +264,8 @@ def _get_method_kwargs(self, kwargs, method_args): A dictionary containing the argument names and values to pass to the primitive method. 
""" + if isinstance(method_args, str): + method_args = getattr(self.instance, method_args)() method_kwargs = dict() for arg in method_args: diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index d7935757..738b13b0 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -177,6 +177,9 @@ def _get_block_variables(self, block_name, variables_attr, names): """ block = self.blocks[block_name] variables = deepcopy(getattr(block, variables_attr)) + if isinstance(variables, str): + variables = getattr(block.instance, variables)() + variable_dict = {} for variable in variables: name = variable['name'] @@ -300,6 +303,12 @@ def get_inputs(self, fit=True): return inputs + def get_fit_args(self): + return list(self.get_inputs(fit=True).values()) + + def get_predict_args(self): + return list(self.get_inputs(fit=False).values()) + def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. @@ -578,6 +587,10 @@ def _get_block_args(self, block_name, block_args, context): input_names = self.input_names.get(block_name, dict()) + if isinstance(block_args, str): + block = self.blocks[block_name] + block_args = getattr(block.instance, block_args)() + kwargs = dict() for arg in block_args: name = arg['name'] @@ -591,6 +604,9 @@ def _get_block_args(self, block_name, block_args, context): def _extract_outputs(self, block_name, outputs, block_outputs): """Extract the outputs of the method as a dict to be set into the context.""" # TODO: type validation and/or transformation should be done here + if isinstance(block_outputs, str): + block = self.blocks[block_name] + block_outputs = getattr(block.instance, block_outputs)() if not isinstance(outputs, tuple): outputs = (outputs, ) diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 0ee4cf2c..be8c6f6b 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -381,6 +381,7 @@ def test_get_outputs_str_named(self): ] } pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + returned = pipeline.get_outputs('debug') expected = [ @@ -389,13 +390,11 @@ def test_get_outputs_str_named(self): 'variable': 'another_variable', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_str_variable(self): pipeline = MLPipeline(['a_primitive', 'another_primitive']) - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -412,7 +411,6 @@ def test_get_outputs_str_variable(self): 'variable': 'a_primitive#1.output' } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -427,7 +425,6 @@ def test_get_outputs_str_block(self): 'variable': 'a_primitive#1', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -442,7 +439,6 @@ def test_get_outputs_int(self): 'variable': 'another_primitive#1', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -463,7 +459,6 @@ def test_get_outputs_combination(self): ] } pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -498,7 +493,6 @@ def test_get_outputs_combination(self): 'variable': 'a_primitive#1.output' } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -550,21 +544,39 @@ def test_get_output_variables(self): assert names == ['a_variable'] 
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test__get_block_variables(self): + def test__get_block_variables_is_dict(self): + pipeline = MLPipeline(['a_primitive']) + pipeline.blocks['a_primitive#1'].produce_outputs = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + outputs = pipeline._get_block_variables( + 'a_primitive#1', + 'produce_outputs', + {'output': 'name_output'} + ) + expected = { 'name_output': { 'name': 'output', 'type': 'whatever', } } + assert outputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test__get_block_variables_is_str(self): pipeline = MLPipeline(['a_primitive']) - - pipeline.blocks['a_primitive#1'].produce_outputs = [ + pipeline.blocks['a_primitive#1'].produce_outputs = 'get_produce_outputs' + pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.return_value = [ { - 'name': 'output', - 'type': 'whatever' + 'name': 'output_from_function', + 'type': 'test' } + ] outputs = pipeline._get_block_variables( @@ -572,10 +584,50 @@ def test__get_block_variables(self): 'produce_outputs', {'output': 'name_output'} ) + + expected = { + 'output_from_function': { + 'name': 'output_from_function', + 'type': 'test', + } + } assert outputs == expected + pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.assert_called_once_with() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_inputs_fit(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs() + expected = { 'input': { 'name': 'input', @@ -589,32 +641,30 @@ def test_get_inputs_fit(self): 'name': 'another_input', 'type': 'another_whatever', } - } + assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_no_fit(self): pipeline = MLPipeline(['a_primitive', 'another_primitive']) - pipeline.blocks['a_primitive#1'].produce_args = [ { 'name': 'input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].fit_args = [ { 'name': 'fit_input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', 'type': 'another_whatever' } ] - pipeline.blocks['another_primitive#1'].produce_args = [ { 'name': 'output', @@ -626,11 +676,8 @@ def test_get_inputs_fit(self): } ] - inputs = pipeline.get_inputs() - assert inputs == expected + inputs = pipeline.get_inputs(fit=False) - @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test_get_inputs_no_fit(self): expected = { 'input': { 'name': 'input', @@ -640,25 +687,24 @@ def test_get_inputs_no_fit(self): 'name': 'another_input', 'type': 'another_whatever', } - } + assert inputs == expected - pipeline = MLPipeline(['a_primitive', 'another_primitive']) - + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_fit_args(self): + pipeline = MLPipeline(['a_primitive']) pipeline.blocks['a_primitive#1'].produce_args = [ { 'name': 'input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].fit_args = [ { 'name': 'fit_input', 
'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -666,20 +712,50 @@ def test_get_inputs_no_fit(self): } ] - pipeline.blocks['another_primitive#1'].produce_args = [ + outputs = pipeline.get_fit_args() + + expected = [ { - 'name': 'output', - 'type': 'another_whatever' + 'name': 'input', + 'type': 'whatever' }, { - 'name': 'another_input', - 'type': 'another_whatever' + 'name': 'fit_input', + 'type': 'whatever', } ] + assert outputs == expected - inputs = pipeline.get_inputs(fit=False) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_predict_args(self): + pipeline = MLPipeline(['a_primitive']) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + outputs = pipeline.get_predict_args() - assert inputs == expected + expected = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + assert outputs == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_pending_all_primitives(self): From 286e0f207d569eff4d2b1a52aeb128965a5372a7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 8 Mar 2021 18:08:19 +0100 Subject: [PATCH 134/160] =?UTF-8?q?Bump=20version:=200.4.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 61438750..5e8d665e 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev0' +__version__ = '0.4.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 96b72ce1..e75ffe48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev0 +current_version = 0.4.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
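For reference, the two convenience methods added in the dynamic inputs patch above (#135) simply flatten ``get_inputs`` into lists. A sketch of how they might be used, assuming ``mlprimitives`` provides the annotation (the exact variable names depend on the primitive's JSON):

.. code-block:: python

    from mlblocks import MLPipeline

    pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

    pipeline.get_fit_args()      # e.g. [{'name': 'X', ...}, {'name': 'y', ...}]
    pipeline.get_predict_args()  # e.g. [{'name': 'X', ...}]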
diff --git a/setup.py b/setup.py index db8f5aa6..a48b031f 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev0', + version='0.4.1.dev1', zip_safe=False, ) From ae1cdd66a10bb0e6341ab716e1fdb7ca7fc51bae Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 27 Sep 2021 10:38:04 -0400 Subject: [PATCH 135/160] Update dependencies (#136) * Increase numpy cap --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a48b031f..78f4053a 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ install_requires = [ 'graphviz>=0.9,<1', - 'numpy>=1.17.1,<1.19', + 'numpy>=1.17.1,<1.21', 'psutil>=5,<6', ] From 3585628764bcb0bb2e06348eed4a90da5df3d4df Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 10:55:56 -0400 Subject: [PATCH 136/160] =?UTF-8?q?Bump=20version:=200.4.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.4.1.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 5e8d665e..f3ead991 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev1' +__version__ = '0.4.1.dev2' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index e75ffe48..b106c1e6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev1 +current_version = 0.4.1.dev2 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 78f4053a..6a193b32 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev1', + version='0.4.1.dev2', zip_safe=False, ) From e8d353da3bf2585d4cbed40f07dda93529690196 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:06:57 -0400 Subject: [PATCH 137/160] prepare release notes --- HISTORY.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index da082c25..0575c034 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,12 @@ Changelog ========= +0.4.1 - 2021-10-08 +------------------ + +* Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish +* Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer + 0.4.0 - 2021-01-09 ------------------ From 16ba53c557a770760bb46fbf17566891a258cdb3 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:07:26 -0400 Subject: [PATCH 138/160] =?UTF-8?q?Bump=20version:=200.4.1.dev2=20?= =?UTF-8?q?=E2=86=92=200.4.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 5e8d665e..9c9d5d13 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev2' +__version__ = '0.4.1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index b106c1e6..84f59fab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev2 +current_version = 0.4.1 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? diff --git a/setup.py b/setup.py index 6a193b32..b7c717be 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev2', + version='0.4.1', zip_safe=False, ) From 515d0a7af4e6466014333eace818d3a64a2ce46b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:07:41 -0400 Subject: [PATCH 139/160] =?UTF-8?q?Bump=20version:=200.4.1=20=E2=86=92=200?= =?UTF-8?q?.4.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 9c9d5d13..9c42ed1a 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1' +__version__ = '0.4.2.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 84f59fab..fc9e4e12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1 +current_version = 0.4.2.dev0 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
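The `parse` expression that these `setup.cfg` hunks keep as context is what lets bumpversion split versions such as `0.4.2.dev0` into components. The sketch below shows it as a plain Python regex; the group names follow the standard bumpversion convention, which is an assumption about this project's config.

```python
# Sketch of the bumpversion `parse` expression from setup.cfg as a Python
# regex; the group names follow the usual bumpversion convention (assumed).
import re

PARSE = re.compile(
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

print(PARSE.match('0.4.2.dev0').groupdict())
# {'major': '0', 'minor': '4', 'patch': '2', 'release': 'dev', 'candidate': '0'}
```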
diff --git a/setup.py b/setup.py index b7c717be..c0432aa4 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1', + version='0.4.2.dev0', zip_safe=False, ) From 79fc8fbc4632f164102c4973badd13cd38c31e84 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 11 Dec 2022 16:12:08 -0600 Subject: [PATCH 140/160] Update `numpy` dependency (#139) * push numpy cap * add separate tests for mlblocks * fix command * create new unit test environment * pin jinja2 * pin markupsafe * add docutils * pin scikit learn for docs * unpin scikit-learn and add okwarning --- .github/workflows/tests.yml | 35 +++++++++++++++++++++-------- Makefile | 6 ++++- docs/getting_started/quickstart.rst | 2 ++ setup.py | 8 +++++-- tests/test_mlpipeline.py | 6 ++--- 5 files changed, 42 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ea2c37f5..4cb525ed 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package @@ -29,11 +29,11 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies @@ -46,31 +46,48 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install .[unit] - name: make test-unit run: make test-unit + unit-mlprimitives: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-20.04, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install .[test] + - name: make test-mlprimitives + run: make test-mlprimitives + tutorials: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest] + os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - if: matrix.os == 'ubuntu-latest' + - if: matrix.os == 'ubuntu-20.04' name: Install dependencies - Ubuntu run: sudo apt-get install graphviz - name: Install package and dependencies diff --git a/Makefile b/Makefile index c28da455..2ae6c7c3 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle .PHONY: test-unit test-unit: ## run tests quickly with the default Python + python -m pytest --cov=mlblocks --ignore=tests/features/ + +.PHONY: 
test-mlprimitives +test-mlprimitives: ## run tests quickly with the default Python python -m pytest --cov=mlblocks .PHONY: test-readme @@ -132,7 +136,7 @@ test-tutorials: ## run the tutorial notebooks jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + .PHONY: test -test: test-unit test-readme ## test everything that needs test dependencies +test: test-unit test-mlprimitives test-readme ## test everything that needs test dependencies .PHONY: check-dependencies check-dependencies: ## test if there are any broken dependencies diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 386752dc..f0cb9a3f 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -102,6 +102,7 @@ To do this, we first call the ``fit`` method passing the training data and the c labels. .. ipython:: python + :okwarning: from mlprimitives.datasets import load_census dataset = load_census() @@ -112,6 +113,7 @@ Once we have fitted our model to our data, we can call the ``predict`` method pa to obtain predictions from the pipeline. .. ipython:: python + :okwarning: predictions = pipeline.predict(X_test) predictions diff --git a/setup.py b/setup.py index c0432aa4..85b05bcd 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ install_requires = [ 'graphviz>=0.9,<1', - 'numpy>=1.17.1,<1.21', + 'numpy>=1.17.1,<2', 'psutil>=5,<6', ] @@ -23,6 +23,7 @@ 'mlprimitives>=0.3.0,<0.4', 'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict 'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3 + 'protobuf<4', # <- importlib ] examples_require = mlprimitives_requires + [ @@ -34,7 +35,6 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.3.0.dev0,<0.4', 'setuptools>=41.0.0', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', @@ -56,8 +56,11 @@ 'm2r>=0.2.0,<0.3', 'Sphinx>=1.7.1,<3', 'sphinx_rtd_theme>=0.2.4,<0.5', + 'docutils>=0.12,<0.18', 'ipython>=6.5.0', 'autodocsumm>=0.1.10', + 'Jinja2>=2,<3', # >=3 makes sphinx theme fail + 'markupsafe<2.1.0', # style check 'flake8>=3.7.7,<4', @@ -97,6 +100,7 @@ description='Pipelines and primitives for machine learning and data science.', extras_require={ 'dev': development_requires + tests_require + examples_require, + 'unit': tests_require, 'test': tests_require + examples_require, 'examples': examples_require, 'mlprimitives': mlprimitives_requires, diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index be8c6f6b..084eac3d 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -1124,7 +1124,7 @@ def test_get_diagram_simple(self): ] pipeline.blocks['a_primitive#1'].produce_output = output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_fit(self): @@ -1155,7 +1155,7 @@ def test_get_diagram_fit(self): ] pipeline.blocks['a_primitive#1'].produce_output = output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_multiple_blocks(self): @@ -1189,7 +1189,7 @@ def test_get_diagram_multiple_blocks(self): pipeline.blocks['b_primitive#1'].produce_args = first_output pipeline.blocks['b_primitive#1'].produce_output = second_output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() def test_fit(self): 
pass From 40c5c413dc62cd1e38b6fa8e40fc858b6ac54479 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:25:44 -0500 Subject: [PATCH 141/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 0575c034..c183b575 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.5.0 - 2023-01-22 +------------------ + +* Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish + 0.4.1 - 2021-10-08 ------------------ From a4ba9c4e588d88b95797117e2562100bb76e6def Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:03 -0500 Subject: [PATCH 142/160] =?UTF-8?q?Bump=20version:=200.4.2.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 9c42ed1a..82a61ca3 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.2.dev0' +__version__ = '0.5.0.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index fc9e4e12..d21c7a1a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.2.dev0 +current_version = 0.5.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 85b05bcd..4926b10a 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.2.dev0', + version='0.5.0.dev0', zip_safe=False, ) From 8140e3dcfe017e2a1e04ada9c6783f2dcdf30198 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:03 -0500 Subject: [PATCH 143/160] =?UTF-8?q?Bump=20version:=200.5.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 82a61ca3..7cc2da30 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.0.dev0' +__version__ = '0.5.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index d21c7a1a..746b4d2f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0.dev0 +current_version = 0.5.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
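The Makefile split introduced in PATCH 140 above separates the pure-mlblocks unit run from the mlprimitives-dependent one. What `make test-unit` executes can be expressed through pytest's Python entry point, as a rough sketch; it assumes it is run from the repository root with `pytest-cov` installed:

```python
# Minimal sketch of the new `make test-unit` target from PATCH 140, via
# pytest's Python entry point; the paths mirror the Makefile, nothing else.
import sys

import pytest

# --ignore keeps the mlprimitives-dependent feature tests out of the unit
# run; `make test-mlprimitives` runs the full suite without the exclusion.
sys.exit(pytest.main(['--cov=mlblocks', '--ignore=tests/features/']))
```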
diff --git a/setup.py b/setup.py index 4926b10a..8b11e6ff 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.0.dev0', + version='0.5.0', zip_safe=False, ) From a70b30713416ca1bc1a4cf2c2675cda383e28ca8 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:18 -0500 Subject: [PATCH 144/160] =?UTF-8?q?Bump=20version:=200.5.0=20=E2=86=92=200?= =?UTF-8?q?.5.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 7cc2da30..3e7aa671 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.0' +__version__ = '0.5.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 746b4d2f..70204a8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0 +current_version = 0.5.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 8b11e6ff..70d599ea 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.0', + version='0.5.1.dev0', zip_safe=False, ) From c74137e6a52c141d2bc10bb8b11de5b72e83ea07 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:17:52 -0400 Subject: [PATCH 145/160] Upgrade python (#142) * add python 3.9 and 3.10 * fix python specification * update python version in readme --- .github/workflows/tests.yml | 2 +- Makefile | 4 ++++ README.md | 2 +- setup.py | 4 +++- tox.ini | 4 +++- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4cb525ed..3f46f728 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -45,7 +45,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 diff --git a/Makefile b/Makefile index 2ae6c7c3..4fa8cc04 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,10 @@ install: clean-build clean-pyc ## install the package to the active Python's sit install-examples: clean-build clean-pyc ## install the package and the examples dependencies pip install .[examples] +.PHONY: install-unit +install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests + pip install .[unit] + .PHONY: install-test install-test: clean-build clean-pyc ## install the package and test dependencies pip install .[test] diff --git a/README.md b/README.md index 4da013b0..13c23c3a 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.6, 3.7, 3.8, 3.9, and 3.10](https://www.python.org/downloads/) ## Install with `pip` diff --git a/setup.py b/setup.py index 70d599ea..17159dbb 100644 --- a/setup.py +++ b/setup.py @@ -96,6 +96,8 @@ 
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', ], description='Pipelines and primitives for machine learning and data science.', extras_require={ @@ -113,7 +115,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), - python_requires='>=3.6,<3.9', + python_requires='>=3.6,<3.11', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index e38f071b..a589526a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,10 @@ [tox] -envlist = py3{6,7,8}, test-devel +envlist = py3{6,7,8,9,10}, test-devel [travis] python = + 3.10: py10 + 3.9: py39 3.8: py38, test-devel 3.7: py37 3.6: py36 From b85983d956699c5863e153816543fb6f29bdb8ff Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 14:28:08 -0400 Subject: [PATCH 146/160] =?UTF-8?q?Bump=20version:=200.5.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 3e7aa671..3b880bb8 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.1.dev0' +__version__ = '0.5.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 70204a8c..40f0d06a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1.dev0 +current_version = 0.5.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
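PATCH 145 widens both the trove classifiers and `python_requires` to `>=3.6,<3.11`. A hedged fail-fast guard mirroring that bound, purely illustrative and not part of the actual package, could look like this:

```python
# Sketch mirroring the widened `python_requires='>=3.6,<3.11'` bound from
# PATCH 145; an illustrative guard, not code that ships with mlblocks.
import sys

if not ((3, 6) <= sys.version_info[:2] < (3, 11)):
    raise RuntimeError('this MLBlocks release supports Python >=3.6,<3.11')
```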
diff --git a/setup.py b/setup.py index 17159dbb..9ab20327 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.1.dev0', + version='0.5.1.dev1', zip_safe=False, ) From 6597bfa501bc341e27f48e2ca357a9b61a17a854 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:18 -0400 Subject: [PATCH 147/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index c183b575..f1c4209f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.6.0 - 2023-04-14 +------------------ + +* Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish + 0.5.0 - 2023-01-22 ------------------ From 1cc2551142cc21165a09f52063545b3edd02fed7 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:31 -0400 Subject: [PATCH 148/160] =?UTF-8?q?Bump=20version:=200.5.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.6.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 3b880bb8..344fd4b2 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.1.dev1' +__version__ = '0.6.0.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 40f0d06a..4637a833 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1.dev1 +current_version = 0.6.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 9ab20327..80137119 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.1.dev1', + version='0.6.0.dev0', zip_safe=False, ) From f934db0d36f4d4965707092209fcafdba74dc330 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:31 -0400 Subject: [PATCH 149/160] =?UTF-8?q?Bump=20version:=200.6.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 344fd4b2..650b26ca 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.0.dev0' +__version__ = '0.6.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 4637a833..2800a7f1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0.dev0 +current_version = 0.6.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 80137119..fd8791a8 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.0.dev0', + version='0.6.0', zip_safe=False, ) From ec8433590f8e928484f49ea0a76543caf7f117b5 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:51 -0400 Subject: [PATCH 150/160] =?UTF-8?q?Bump=20version:=200.6.0=20=E2=86=92=200?= =?UTF-8?q?.6.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 650b26ca..021d9734 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.0' +__version__ = '0.6.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 2800a7f1..40e7b099 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0 +current_version = 0.6.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index fd8791a8..c9658a63 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.0', + version='0.6.1.dev0', zip_safe=False, ) From 21f0df503609fe256ca9711b98fd92f4b83a522e Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:43:52 -0400 Subject: [PATCH 151/160] Add python 3.11 to MLBlocks (#143) * test python 3.11 * pin lightfm * update pip * fix syntax * add wheel * fix data loading * fix readme example * remove data --- .github/workflows/tests.yml | 18 ++++++- README.md | 12 +++-- docs/getting_started/quickstart.rst | 14 +++-- .../tutorials/1. Using and MLPipeline.ipynb | 23 +++++--- .... Setting MLPipeline Hyperparameters.ipynb | 24 +++++++-- .../4. Saving and Loading a Pipeline.ipynb | 19 +++++-- ...ial execution and pipeline debugging.ipynb | 19 +++++-- .../6. Flexible outputs specification.ipynb | 30 ++++++++--- examples/tutorials/7. 
Tuning a Pipeline.ipynb | 4 +- ...or the best pipeline with BTBSession.ipynb | 20 +++---- examples/tutorials/utils.py | 52 +++++++++++++++++++ setup.py | 3 +- tox.ini | 3 +- 13 files changed, 187 insertions(+), 54 deletions(-) create mode 100644 examples/tutorials/utils.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3f46f728..0eb00220 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,6 +19,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package run: pip install .[dev] - name: make test-devel @@ -36,6 +40,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install rundoc .[mlprimitives] - name: make test-readme @@ -45,7 +53,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -70,6 +78,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install .[test] - name: make test-mlprimitives @@ -90,6 +102,10 @@ jobs: - if: matrix.os == 'ubuntu-20.04' name: Install dependencies - Ubuntu run: sudo apt-get install graphviz + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install .[examples] - name: make test-tutorials diff --git a/README.md b/README.md index 13c23c3a..662a3ed3 100644 --- a/README.md +++ b/README.md @@ -86,11 +86,15 @@ pipeline which combines primitives from [MLPrimitives](https://github.com/MLBaza [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). ```python3 +import pandas as pd from mlblocks import MLPipeline -from mlprimitives.datasets import load_dataset +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score -dataset = load_dataset('census') -X_train, X_test, y_train, y_test = dataset.get_splits(1) +dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') +label = dataset.pop('label') + +X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) primitives = [ 'mlprimitives.custom.preprocessing.ClassEncoder', @@ -104,7 +108,7 @@ pipeline = MLPipeline(primitives) pipeline.fit(X_train, y_train) predictions = pipeline.predict(X_test) -dataset.score(y_test, predictions) +accuracy_score(y_test, predictions) ``` # What's Next? diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index f0cb9a3f..55c20d86 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -104,9 +104,13 @@ labels. .. 
ipython:: python :okwarning: - from mlprimitives.datasets import load_census - dataset = load_census() - X_train, X_test, y_train, y_test = dataset.get_splits(1) + import pandas as pd + from sklearn.model_selection import train_test_split + + dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') + label = dataset.pop('label') + + X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) pipeline.fit(X_train, y_train) Once we have fitted our model to our data, we can call the ``predict`` method passing new data @@ -115,9 +119,11 @@ to obtain predictions from the pipeline. .. ipython:: python :okwarning: + from sklearn.metrics import accuracy_score + predictions = pipeline.predict(X_test) predictions - dataset.score(y_test, predictions) + accuracy_score(y_test, predictions) .. _you have already installed them: install.html#additional-dependencies .. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb index dab130ea..901cc50b 100644 --- a/examples/tutorials/1. Using and MLPipeline.ipynb +++ b/examples/tutorials/1. Using and MLPipeline.ipynb @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -528,7 +528,16 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -546,9 +555,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "predictions = pipeline.predict(X_test)" @@ -611,7 +618,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -625,7 +632,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 4993fd4e..7aa0ab2b 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')\n", + "dataset = load_census()\n", "X_train, X_test, y_train, y_test = dataset.get_splits(1)" ] }, @@ -268,6 +268,14 @@ "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -394,6 +402,14 @@ "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -415,7 +431,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -429,7 +445,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb index 01a58cd5..ec1c6f97 100644 --- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb +++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb @@ -35,9 +35,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -71,7 +71,16 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -166,7 +175,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -180,7 +189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb index 57b2b43c..769a69c1 100644 --- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb +++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb @@ -36,9 +36,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -430,7 +430,16 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)" ] @@ -690,7 +699,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -704,7 +713,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb index ca1048dd..6ecad5a5 100644 --- a/examples/tutorials/6. Flexible outputs specification.ipynb +++ b/examples/tutorials/6. Flexible outputs specification.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -420,7 +420,16 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "output_spec = [\n", " 'sklearn.impute.SimpleImputer#1.X',\n", @@ -441,7 +450,16 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "output_spec = [\n", " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n", @@ -495,7 +513,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -509,7 +527,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index ca30df17..7a288a46 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. Tuning a Pipeline.ipynb @@ -34,9 +34,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 44431d4f..80ad93fb 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -309,9 +309,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -536,9 +534,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -707,9 +703,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -772,7 +766,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -786,7 +780,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/utils.py b/examples/tutorials/utils.py new file mode 100644 index 00000000..32b210a7 --- /dev/null +++ b/examples/tutorials/utils.py @@ -0,0 +1,52 @@ +import io +import os + +import pandas as pd +from sklearn.metrics import accuracy_score +from mlprimitives.datasets import Dataset + +DATA_PATH = os.path.join( + os.path.dirname(__file__), + 'data' +) + +DATA_URL = 'http://mlblocks.s3.amazonaws.com/{}.csv' + +def _download(dataset_name, dataset_path): + url = DATA_URL.format(dataset_name) + + data = pd.read_csv(url) + data.to_csv(dataset_path, index=False) + +def _load(dataset_name): + if not os.path.exists(DATA_PATH): + os.makedirs(DATA_PATH) + + dataset_path = os.path.join(DATA_PATH, dataset_name + '.csv') + if not os.path.exists(dataset_path): + _download(dataset_name, dataset_path) + + return dataset_path + +def load_census(): + """Adult Census dataset. + + Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset. + + Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean + records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && + (AFNLWGT>1)&& (HRSWK>0)) + + Prediction task is to determine whether a person makes over 50K a year. 
+ + source: "UCI + sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income" + """ + + dataset_path = _load('census_train') + + X = pd.read_csv(dataset_path) + y = X.pop('label').values + + return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table', + 'classification', 'binary', stratify=True) \ No newline at end of file diff --git a/setup.py b/setup.py index c9658a63..3df32765 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], description='Pipelines and primitives for machine learning and data science.', extras_require={ @@ -115,7 +116,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), - python_requires='>=3.6,<3.11', + python_requires='>=3.6,<3.12', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index a589526a..27e499ed 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,9 @@ [tox] -envlist = py3{6,7,8,9,10}, test-devel +envlist = py3{6,7,8,9,10,11}, test-devel [travis] python = + 3.11: py11 3.10: py10 3.9: py39 3.8: py38, test-devel From d401d1026dec4c60a4daed19d97daee58f5b573c Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 11:25:02 -0400 Subject: [PATCH 152/160] =?UTF-8?q?Bump=20version:=200.6.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.6.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 021d9734..86777d40 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1.dev0' +__version__ = '0.6.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 40e7b099..33532996 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1.dev0 +current_version = 0.6.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 3df32765..3575b6d0 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1.dev0', + version='0.6.1.dev1', zip_safe=False, ) From 76a0b5767006aad76ccf8761c3c4d6f3bf0c642a Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:41:41 -0400 Subject: [PATCH 153/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index f1c4209f..1fcf520f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.6.1 - 2023-09-26 +------------------ + +* Add python 3.11 to MLBlocks - [Issue #143](https://github.com/MLBazaar/MLBlocks/issues/143) by @sarahmish + 0.6.0 - 2023-04-14 ------------------ From 4d8c9d5742f4b3901eb4d49aa8c6b66756ccc6a4 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:41:51 -0400 Subject: [PATCH 154/160] =?UTF-8?q?Bump=20version:=200.6.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.6.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 86777d40..4646fd8b 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1.dev1' +__version__ = '0.6.1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 33532996..e02d1a91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1.dev1 +current_version = 0.6.1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 3575b6d0..4b211e2b 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1.dev1', + version='0.6.1', zip_safe=False, ) From 1658ee0552e678e6b6c04c394e22d8e60a8e7112 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:42:11 -0400 Subject: [PATCH 155/160] =?UTF-8?q?Bump=20version:=200.6.1=20=E2=86=92=200?= =?UTF-8?q?.6.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 4646fd8b..f42e9f83 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1' +__version__ = '0.6.2.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index e02d1a91..d582e738 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1 +current_version = 0.6.2.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 4b211e2b..c741eadc 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1', + version='0.6.2.dev0', zip_safe=False, ) From cf3bd258842864b9f8996dd7e1e0e735d635eb5c Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 17 Nov 2024 12:03:59 -0500 Subject: [PATCH 156/160] Upgrade python version to include 3.12 and 3.13 (#144) * update python * update dependencies * mute simpleimputer for now * edit docs * change image * restore tutorials * fix devel tests * change btb to baytune * fix python specification * cap copulas at 0.11 * update readme --- .github/workflows/tests.yml | 12 +- README.md | 2 +- examples/tutorials/7. Tuning a Pipeline.ipynb | 31 ++- ...or the best pipeline with BTBSession.ipynb | 241 ++++++------------ setup.py | 32 ++- tox.ini | 8 +- 6 files changed, 126 insertions(+), 200 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0eb00220..cbadf809 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.8] + python-version: ['3.10'] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -20,7 +20,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Upgrade pip - run: pip install -U pip setuptools wheel + run: pip install -U "pip<=24.1" setuptools wheel - name: Install lightfm run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package @@ -32,7 +32,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -53,7 +53,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -70,7 +70,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -91,7 +91,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 diff --git a/README.md b/README.md index 662a3ed3..fb5ba341 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.6, 3.7, 3.8, 3.9, and 3.10](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12, 3.13](https://www.python.org/downloads/) ## Install with `pip` diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index 7a288a46..484e0b22 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. 
Tuning a Pipeline.ipynb @@ -181,7 +181,7 @@ { "data": { "text/plain": [ - "0.8639171383183359" + "0.863978563379761" ] }, "execution_count": 6, @@ -210,7 +210,7 @@ { "data": { "text/plain": [ - "0.8686773872402614" + "0.868554574842" ] }, "execution_count": 7, @@ -242,7 +242,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.tuning import Tunable\n", + "from baytune.tuning import Tunable\n", "\n", "tunable = Tunable.from_dict(tunable_hyperparameters)" ] @@ -265,7 +265,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.tuning import GPTuner\n", + "from baytune.tuning import GPTuner\n", "\n", "tuner = GPTuner(tunable)" ] @@ -345,16 +345,15 @@ "output_type": "stream", "text": [ "scoring pipeline 1\n", + "New best found: 0.871994161365419\n", "scoring pipeline 2\n", + "New best found: 0.8723319756253888\n", "scoring pipeline 3\n", "scoring pipeline 4\n", - "New best found: 0.8642241881762839\n", "scoring pipeline 5\n", "scoring pipeline 6\n", "scoring pipeline 7\n", - "New best found: 0.8644390957265209\n", "scoring pipeline 8\n", - "New best found: 0.8679095503945804\n", "scoring pipeline 9\n", "scoring pipeline 10\n" ] @@ -395,13 +394,13 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 39,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'most_frequent',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 70,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 6,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.07406443671152008,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9244108160038952,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + " 'max_labels'): 60,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 190,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.13575511242790694,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.6326488945712287,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8}" ] }, "execution_count": 13, @@ -443,7 +442,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -457,7 +456,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 80ad93fb..a7e9d69a 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -157,7 +157,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.session import BTBSession\n", + "from baytune.session import BTBSession\n", "\n", "session = BTBSession(tunables, cross_validate, verbose=True)" ] @@ -314,12 +314,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "342fe40f08024adcb5b60eea25f49d37", + "model_id": "00c20e4b982f42a1873c0d12f550ee4b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" + " 0%| | 0/5 [00:00\", line 11, in cross_validate\n", + " File \"/var/folders/by/d1f3gk0x14v54qggfxmjbn1c0000gn/T/ipykernel_19852/2674531477.py\", line 11, in cross_validate\n", " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 754, in fit\n", - " block, block_name, context, output_variables, outputs, debug_info)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlpipeline.py\", line 805, in fit\n", + " self._produce_block(\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlpipeline.py\", line 679, in _produce_block\n", " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlblock.py\", line 331, in produce\n", " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " File \"/Users/sarah/Documents/git-repos/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " File \"/opt/anaconda3/envs/py10/lib/python3.10/site-packages/pandas/core/frame.py\", line 3807, in __getitem__\n", " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"/opt/anaconda3/envs/py10/lib/python3.10/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n", + " raise KeyError(key) from err\n", "KeyError: 'text'\n", - "2020-09-16 16:32:46,587 - WARNING - btb.session - Too many errors: 1. 
Removing tunable single_table.classification.text\n", - "2020-09-16 16:32:46,589 - INFO - btb.session - Creating Tunable instance from dict.\n", - "2020-09-16 16:32:46,589 - INFO - btb.session - Obtaining default configuration for single_table.classification.xgb\n", - "2020-09-16 16:32:52,100 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", - "2020-09-16 16:33:28,900 - INFO - btb.session - New optimal found: single_table.classification - 0.8728234138413778\n", - "2020-09-16 16:33:28,904 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" + "Too many errors: 1. Removing tunable single_table.classification.text\n" ] }, { "data": { "text/plain": [ - "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", - " 'name': 'single_table.classification',\n", + "{'id': '0ebe8af9c06a05f39821de36d6c9ffc2',\n", + " 'name': 'single_table.classification.xgb',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 63,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 7315,\n", + " 'max_labels'): 52,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", - " 'score': 0.8728234138413778}" + " ('xgboost.XGBClassifier#1', 'n_estimators'): 313,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7119589664956909,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.944854007471167,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.8641320270062784}" ] }, "execution_count": 11, @@ -489,23 +458,17 @@ { "data": { "text/plain": [ - "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", - " 'name': 'single_table.classification',\n", + "{'id': '0ebe8af9c06a05f39821de36d6c9ffc2',\n", + " 'name': 'single_table.classification.xgb',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 63,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 7315,\n", + " 'max_labels'): 52,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", - " 'score': 0.8728234138413778}" + " ('xgboost.XGBClassifier#1', 'n_estimators'): 313,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7119589664956909,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 
+      " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n",
+      " 'score': 0.8641320270062784}"
      ]
     },
     "execution_count": 12,
@@ -539,71 +502,31 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "8dd5d4626f304c279b2b368a671b6cb7",
+      "model_id": "a0dbe69a0340455a937f7376f7723ec4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
-      "HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))"
+      " 0%|          | 0/10 [00:00<?, ?it/s]"
     'graphviz>=0.9,<1',
-    'numpy>=1.17.1,<2',
-    'psutil>=5,<6',
+    'numpy>=1.17.1,<3',
+    'psutil>=5,<7',
 ]
 
 mlprimitives_requires = [
-    'mlprimitives>=0.3.0,<0.4',
-    'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict
-    'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3
+    'mlprimitives>=0.4.0,<0.5',
+    'h5py<4,>=2.10.0', # <- tensorflow 2.3.2 conflict
+    'matplotlib<4,>=2.2.2', # <- copulas 0.3.3
     'protobuf<4', # <- importlib
 ]
 
 examples_require = mlprimitives_requires + [
     'jupyter==1.0.0',
-    'baytune>=0.4.0,<0.5',
+    'baytune>=0.5.0,<0.6',
+    'copulas<0.12',
 ]
 
@@ -50,7 +51,7 @@
     # general
     'bumpversion>=0.5.3,<0.6',
     'pip>=9.0.1',
-    'watchdog>=0.8.3,<0.11',
+    'watchdog>=0.8.3,<5',
 
     # docs
     'm2r>=0.2.0,<0.3',
@@ -62,6 +63,15 @@
     'Jinja2>=2,<3', # >=3 makes sphinx theme fail
     'markupsafe<2.1.0',
 
+    # fails on Sphinx < v3.4
+    'alabaster<=0.7.12',
+    # fails on Sphinx < v5.0
+    'sphinxcontrib-applehelp<1.0.8',
+    'sphinxcontrib-devhelp<1.0.6',
+    'sphinxcontrib-htmlhelp<2.0.5',
+    'sphinxcontrib-serializinghtml<1.1.10',
+    'sphinxcontrib-qthelp<1.0.7',
+
     # style check
     'flake8>=3.7.7,<4',
     'isort>=4.3.4,<5',
@@ -93,12 +103,12 @@
     'License :: OSI Approved :: MIT License',
     'Natural Language :: English',
     'Programming Language :: Python :: 3',
-    'Programming Language :: Python :: 3.6',
-    'Programming Language :: Python :: 3.7',
     'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
     'Programming Language :: Python :: 3.10',
-    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.12',
+    'Programming Language :: Python :: 3.13',
 ],
 description='Pipelines and primitives for machine learning and data science.',
 extras_require={
@@ -116,7 +126,7 @@
     long_description_content_type='text/markdown',
     name='mlblocks',
     packages=find_packages(include=['mlblocks', 'mlblocks.*']),
-    python_requires='>=3.6,<3.12',
+    python_requires='>=3.8,<3.14',
     setup_requires=setup_requires,
     test_suite='tests',
     tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index 27e499ed..cdaadc29 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,12 +3,12 @@ envlist = py3{6,7,8,9,10,11}, test-devel
 
 [travis]
 python =
-    3.11: py11
-    3.10: py10
+    3.13: py313
+    3.12: py312
+    3.11: py311
+    3.10: py310
     3.9: py39
     3.8: py38, test-devel
-    3.7: py37
-    3.6: py36
 
 [testenv]
 passenv = CI TRAVIS TRAVIS_*

From a38b46a1fa4ae19998437759e34da9941f2066f6 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 11:53:06 -0500
Subject: [PATCH 157/160] =?UTF-8?q?Bump=20version:=200.6.2.dev0=20?=
 =?UTF-8?q?=E2=86=92=200.6.2.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index f42e9f83..0fbf2c2f 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2.dev0'
+__version__ = '0.6.2.dev1'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index d582e738..9be18137 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2.dev0
+current_version = 0.6.2.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index cc59e712..0ff336d1 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2.dev0',
+    version='0.6.2.dev1',
     zip_safe=False,
 )

From 1b3ffbce7379d20548501a472cd2c8331d67e1e2 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:18 -0500
Subject: [PATCH 158/160] add release notes

---
 HISTORY.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 1fcf520f..97c363f3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+0.6.2 - 2024-11-18
+------------------
+
+* Upgrade python version to include 3.12 and 3.13 - [Issue #144](https://github.com/MLBazaar/MLBlocks/issues/144) by @sarahmish
+
 0.6.1 - 2023-09-26
 ------------------

From 1406d3783b2bdd4f36f6243590207fdf1b6f668a Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:22 -0500
Subject: [PATCH 159/160] =?UTF-8?q?Bump=20version:=200.6.2.dev1=20?=
 =?UTF-8?q?=E2=86=92=200.6.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 0fbf2c2f..22734701 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2.dev1'
+__version__ = '0.6.2'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 9be18137..dfc0a44b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2.dev1
+current_version = 0.6.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 0ff336d1..ee4f2884 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2.dev1',
+    version='0.6.2',
     zip_safe=False,
 )

From db5ff4b925358ef568492b45058dddded05be873 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:35 -0500
Subject: [PATCH 160/160] =?UTF-8?q?Bump=20version:=200.6.2=20=E2=86=92=200?=
 =?UTF-8?q?.6.3.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 22734701..fa7130da 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2'
+__version__ = '0.6.3.dev0'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index dfc0a44b..8908f680 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2
+current_version = 0.6.3.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index ee4f2884..e4ab47c9 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2',
+    version='0.6.3.dev0',
     zip_safe=False,
 )