From 396114b6140a12ead0d52993c8e42d61d2e4dd13 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 7 Jan 2019 20:13:41 +0100 Subject: [PATCH 001/160] Discover primitives using entry_points --- mlblocks/primitives.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 337116e7..8902b672 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -9,11 +9,15 @@ import json import os +import pkg_resources import sys +_PRIMITIVES_FOLDER_NAME = 'mlprimitives' +_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives' _PRIMITIVES_PATHS = [ - os.path.join(os.getcwd(), 'mlblocks_primitives'), - os.path.join(sys.prefix, 'mlblocks_primitives'), + os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME), + os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME), # legacy + os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME), # legacy ] @@ -45,7 +49,13 @@ def get_primitives_paths(): list: The list of folders. """ - return _PRIMITIVES_PATHS + + primitives_paths = list() + for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME): + path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name) + primitives_paths.append(path) + + return _PRIMITIVES_PATHS + primitives_paths def load_primitive(name): @@ -69,10 +79,17 @@ def load_primitive(name): found. """ - for base_path in _PRIMITIVES_PATHS: - json_path = os.path.join(base_path, name + '.json') - if os.path.isfile(json_path): - with open(json_path, 'r') as json_file: - return json.load(json_file) + for base_path in get_primitives_paths(): + parts = name.split('.') + number_of_parts = len(parts) + + for folder_parts in range(number_of_parts): + folder = os.path.join(base_path, *parts[:folder_parts]) + filename = '.'.join(parts[folder_parts:]) + '.json' + json_path = os.path.join(folder, filename) + + if os.path.isfile(json_path): + with open(json_path, 'r') as json_file: + return json.load(json_file) raise ValueError("Unknown primitive: {}".format(name)) From da04277d5268b194bd33707735a14b79d1cf1239 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 8 Jan 2019 14:36:27 +0100 Subject: [PATCH 002/160] Fix import order and add tests --- mlblocks/primitives.py | 6 ++++-- tests/test_primitives.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 8902b672..d4825bf6 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -9,9 +9,10 @@ import json import os -import pkg_resources import sys +import pkg_resources + _PRIMITIVES_FOLDER_NAME = 'mlprimitives' _OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives' _PRIMITIVES_PATHS = [ @@ -52,7 +53,8 @@ def get_primitives_paths(): primitives_paths = list() for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME): - path = pkg_resources.resource_filename(entry_point.name, entry_point.module_name) + module_path = os.path.join(*entry_point.module_name.split('.')) + path = pkg_resources.resource_filename(entry_point.name, module_path) primitives_paths.append(path) return _PRIMITIVES_PATHS + primitives_paths diff --git a/tests/test_primitives.py b/tests/test_primitives.py index 65906406..990c4da5 100644 --- a/tests/test_primitives.py +++ b/tests/test_primitives.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pytest +from pkg_resources import EntryPoint from mlblocks import primitives @@ -36,12 +37,39 @@ def test_add_primitives_path(): 
@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -def test_get_primitives_paths(): +@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name') +def test_get_primitives_paths_no_entry_points(): paths = primitives.get_primitives_paths() assert paths == ['a', 'b'] +@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +def test_get_primitives_paths_entry_points(iep_mock): + # setup + iep_mock.return_value = [ + EntryPoint('mlblocks', 'primitives.jsons') + ] + + # run + paths = primitives.get_primitives_paths() + + # assert + expected = [ + 'a', + 'b', + os.path.join( + os.path.dirname(primitives.__file__), + 'primitives', + 'jsons' + ) + ] + assert paths == expected + + iep_mock.assert_called_once_with('mlprimitives') + + @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) def test_load_primitive_value_error(): with pytest.raises(ValueError): From f551d339217554472fb5cecc162d5ab31f0d10d6 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 8 Jan 2019 19:02:43 +0100 Subject: [PATCH 003/160] Change slightly the way the entry points are used and add docs --- .gitignore | 1 + docs/advanced_usage/adding_primitives.rst | 34 ++++++++- docs/api/mlblocks.primitives.rst | 5 ++ docs/index.rst | 1 + docs/pipeline.json | 91 ----------------------- mlblocks/primitives.py | 33 +++++--- tests/__init__.py | 0 tests/test_primitives.py | 30 +++++--- 8 files changed, 83 insertions(+), 112 deletions(-) create mode 100644 docs/api/mlblocks.primitives.rst delete mode 100644 docs/pipeline.json create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore index cbc1f8c1..011ff452 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/pipeline.json # PyBuilder target/ diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index fc2e81b9..e3d4b964 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -29,7 +29,7 @@ by writing the corresponding `JSON annotation .. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives -.. note:: If you integrate new primitives for MLBlocks, please consider contributing them to the +.. note:: If you create new primitives for MLBlocks, please consider contributing them to the **MLPrimitives** project! The first thing to do when adding a new primitive is making sure that it complies with the @@ -58,8 +58,8 @@ place known to **MLBlocks**. **MLBlocks** looks for primitives in the following folders, in this order: 1. Any folder specified by the user, starting by the latest one. -2. A folder named `mlblocks_primitives` in the current working directory. -3. A folder named `mlblocks_primitives` in the `system prefix`_. +2. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory. +3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_. .. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix @@ -80,3 +80,31 @@ However, sometimes you will want to add a custom directory. This can be easily done by using the `mlblocks.add_primitives_path`_ method. .. 
_mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path
+
+Developing a Primitives Library
+-------------------------------
+
+Another option to add multiple primitives is creating a primitives library, such as
+`MLPrimitives`_.
+
+In order to make **MLBlocks** able to find the primitives defined in such a library,
+all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
+following specification:
+
+1. It has to be published under the name ``mlprimitives``.
+2. It has to be named exactly ``jsons_path``.
+3. It has to point at a variable that contains the path to the JSONS folder.
+
+An example of such an entry point would be::
+
+    entry_points = {
+        'mlprimitives': [
+            'jsons_path=some_module:SOME_VARIABLE'
+        ]
+    }
+
+where the module `some_module` contains a variable such as::
+
+    SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
+.. _Entry Point: https://packaging.python.org/specifications/entry-points/
diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst
new file mode 100644
index 00000000..d625c774
--- /dev/null
+++ b/docs/api/mlblocks.primitives.rst
@@ -0,0 +1,5 @@
+mlblocks.primitives
+===================
+
+.. automodule:: mlblocks.primitives
+    :members:
diff --git a/docs/index.rst b/docs/index.rst
index 28a3f0bb..2bb4c5a9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -74,6 +74,7 @@ integrate with deep learning libraries.
 
    api/mlblocks
    api/mlblocks.datasets
+   api/mlblocks.primitives
 
 .. toctree::
    :caption: Resources
diff --git a/docs/pipeline.json b/docs/pipeline.json
deleted file mode 100644
index c09d763c..00000000
--- a/docs/pipeline.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
-    "primitives": [
-        "sklearn.preprocessing.StandardScaler",
-        "sklearn.ensemble.RandomForestClassifier"
-    ],
-    "init_params": {
-        "sklearn.preprocessing.StandardScaler": {
-            "with_mean": false
-        },
-        "sklearn.ensemble.RandomForestClassifier": {
-            "n_estimators": 100
-        }
-    },
-    "input_names": {},
-    "output_names": {},
-    "hyperparameters": {
-        "sklearn.preprocessing.StandardScaler#1": {
-            "with_mean": false,
-            "with_std": true
-        },
-        "sklearn.ensemble.RandomForestClassifier#1": {
-            "n_jobs": -1,
-            "n_estimators": 100,
-            "criterion": "entropy",
-            "max_features": null,
-            "max_depth": 10,
-            "min_samples_split": 0.1,
-            "min_samples_leaf": 0.1,
-            "class_weight": null
-        }
-    },
-    "tunable_hyperparameters": {
-        "sklearn.preprocessing.StandardScaler#1": {
-            "with_std": {
-                "type": "bool",
-                "default": true
-            }
-        },
-        "sklearn.ensemble.RandomForestClassifier#1": {
-            "criterion": {
-                "type": "str",
-                "default": "entropy",
-                "values": [
-                    "entropy",
-                    "gini"
-                ]
-            },
-            "max_features": {
-                "type": "str",
-                "default": null,
-                "range": [
-                    null,
-                    "auto",
-                    "log2"
-                ]
-            },
-            "max_depth": {
-                "type": "int",
-                "default": 10,
-                "range": [
-                    1,
-                    30
-                ]
-            },
-            "min_samples_split": {
-                "type": "float",
-                "default": 0.1,
-                "range": [
-                    0.0001,
-                    0.5
-                ]
-            },
-            "min_samples_leaf": {
-                "type": "float",
-                "default": 0.1,
-                "range": [
-                    0.0001,
-                    0.5
-                ]
-            },
-            "class_weight": {
-                "type": "str",
-                "default": null,
-                "range": [
-                    null,
-                    "balanced"
-                ]
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index d4825bf6..8aaaa60f 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -13,12 +13,11 @@
 
 import pkg_resources
 
-_PRIMITIVES_FOLDER_NAME = 'mlprimitives'
-_OLD_PRIMITIVES_FOLDER_NAME = 'mlblocks_primitives'
 _PRIMITIVES_PATHS = [
-    os.path.join(os.getcwd(), _PRIMITIVES_FOLDER_NAME),
-    os.path.join(os.getcwd(), _OLD_PRIMITIVES_FOLDER_NAME),    # legacy
-    os.path.join(sys.prefix, _OLD_PRIMITIVES_FOLDER_NAME),     # legacy
+    os.path.join(os.getcwd(), 'mlprimitives'),
+    os.path.join(sys.prefix, 'mlprimitives'),
+    os.path.join(os.getcwd(), 'mlblocks_primitives'),    # legacy
+    os.path.join(sys.prefix, 'mlblocks_primitives'),     # legacy
 ]
 
 
@@ -46,16 +45,32 @@ def add_primitives_path(path):
 def get_primitives_paths():
     """Get the list of folders where the primitives will be looked for.
 
+    This list will include the value of any `entry_point` named `jsons_path` published under
+    the name `mlprimitives`.
+
+    An example of such an entry point would be::
+
+        entry_points = {
+            'mlprimitives': [
+                'jsons_path=some_module:SOME_VARIABLE'
+            ]
+        }
+
+    where the module `some_module` contains a variable such as::
+
+        SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons')
+
     Returns:
         list:
             The list of folders.
     """
 
     primitives_paths = list()
-    for entry_point in pkg_resources.iter_entry_points(_PRIMITIVES_FOLDER_NAME):
-        module_path = os.path.join(*entry_point.module_name.split('.'))
-        path = pkg_resources.resource_filename(entry_point.name, module_path)
-        primitives_paths.append(path)
+    entry_points = pkg_resources.iter_entry_points('mlprimitives')
+    for entry_point in entry_points:
+        if entry_point.name == 'jsons_path':
+            path = entry_point.load()
+            primitives_paths.append(path)
 
     return _PRIMITIVES_PATHS + primitives_paths
 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_primitives.py b/tests/test_primitives.py
index 990c4da5..1afd17b6 100644
--- a/tests/test_primitives.py
+++ b/tests/test_primitives.py
@@ -7,10 +7,12 @@
 from unittest.mock import patch
 
 import pytest
-from pkg_resources import EntryPoint
+from pkg_resources import Distribution, EntryPoint
 
 from mlblocks import primitives
 
+FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
 def test_add_primitives_path_do_nothing():
@@ -36,19 +39,33 @@ def test_add_primitives_path():
 
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
-@patch('mlblocks.primitives._PRIMITIVES_FOLDER_NAME', new='fake_name')
-def test_get_primitives_paths_no_entry_points():
+@patch('mlblocks.primitives.pkg_resources.iter_entry_points')
+def test_get_primitives_paths_no_entry_points(iep_mock):
+    # setup
+    iep_mock.return_value = []
+
+    # run
     paths = primitives.get_primitives_paths()
 
+    # assert
     assert paths == ['a', 'b']
+    iep_mock.assert_called_once_with('mlprimitives')
 
 
 @patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b'])
 @patch('mlblocks.primitives.pkg_resources.iter_entry_points')
 def test_get_primitives_paths_entry_points(iep_mock):
     # setup
+    something_else_ep = EntryPoint('something_else', 'mlblocks.__version__')
+    jsons_path_ep = EntryPoint(
+        'jsons_path',
+        'tests.test_primitives',
+        attrs=['FAKE_MLPRIMITIVES_PATH'],
+        dist=Distribution()
+    )
     iep_mock.return_value = [
-        EntryPoint('mlblocks', 'primitives.jsons')
+        something_else_ep,
+        jsons_path_ep
     ]
 
     # run
@@ -59,11 +75,7 @@ def test_get_primitives_paths_entry_points(iep_mock):
     expected = [
         'a',
         'b',
-        os.path.join(
-            os.path.dirname(primitives.__file__),
-            'primitives',
-            'jsons'
-        )
+        'this/is/a/fake'
     ]
     assert paths == expected
 

From 74da0e2249cb30100229c64cd0a83a4543daf12e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 9 Jan 2019 16:06:47 +0100
Subject: [PATCH 004/160] Add logging statements

---
 mlblocks/__init__.py   |  6 +++---
mlblocks/datasets.py | 9 +++++++++ mlblocks/mlblock.py | 6 +++++- mlblocks/mlpipeline.py | 38 ++++++++++++++++++++++++-------------- mlblocks/primitives.py | 5 +++++ 5 files changed, 46 insertions(+), 18 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index cfc0ef6a..43079986 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -10,9 +10,9 @@ * Documentation: https://HDI-Project.github.io/MLBlocks """ -from mlblocks.mlblock import MLBlock # noqa -from mlblocks.mlpipeline import MLPipeline # noqa -from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive # noqa +from mlblocks.mlblock import MLBlock +from mlblocks.mlpipeline import MLPipeline +from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive __author__ = 'MIT Data To AI Lab' __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index fba968e8..b5ed6b46 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -40,6 +40,7 @@ """ import io +import logging import os import tarfile import urllib @@ -52,6 +53,8 @@ from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score from sklearn.model_selection import KFold, StratifiedKFold, train_test_split +LOGGER = logging.getLogger(__name__) + INPUT_SHAPE = [224, 224, 3] DATA_PATH = os.path.join( @@ -183,9 +186,12 @@ def get_splits(self, n_splits=1): def _download(dataset_name, dataset_path): url = DATA_URL.format(dataset_name) + + LOGGER.debug('Downloading dataset %s from %s', dataset_name, url) response = urllib.request.urlopen(url) bytes_io = io.BytesIO(response.read()) + LOGGER.debug('Extracting dataset into %s', DATA_PATH) with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf: tf.extractall(DATA_PATH) @@ -202,6 +208,7 @@ def _load(dataset_name): def _load_images(image_dir, filenames): + LOGGER.debug('Loading %s images from %s', len(filenames), image_dir) images = [] for filename in filenames: filename = os.path.join(image_dir, filename) @@ -217,6 +224,8 @@ def _load_images(image_dir, filenames): def _load_csv(dataset_path, name, set_index=False): csv_path = os.path.join(dataset_path, name + '.csv') + + LOGGER.debug('Loading csv %s', csv_path) df = pd.read_csv(csv_path) if set_index: diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 9b6ec0d0..04a4bf55 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -3,9 +3,12 @@ """Package where the MLBlock class is defined.""" import importlib +import logging from mlblocks.primitives import load_primitive +LOGGER = logging.getLogger(__name__) + def import_object(object_name): """Import an object from its Fully Qualified Name.""" @@ -83,7 +86,7 @@ def _extract_params(self, kwargs, hyperparameters): value = param['default'] else: - raise TypeError("Required argument '{}' not found".format(name)) + raise TypeError("{} required argument '{}' not found".format(self.name, name)) init_params[name] = value @@ -193,6 +196,7 @@ def set_hyperparameters(self, hyperparameters): self._hyperparameters.update(hyperparameters) if self._class: + LOGGER.debug('Creating a new primitive instance for %s', self.name) self.instance = self.primitive(**self._hyperparameters) def fit(self, **kwargs): diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 4bad5d1f..058737ee 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -215,19 +215,25 @@ def fit(self, X=None, y=None, **kwargs): last_block_name = list(self.blocks.keys())[-1] for block_name, 
block in self.blocks.items(): - fit_args = self._get_block_args(block_name, block.fit_args, context) - LOGGER.debug("Fitting block %s", block_name) - block.fit(**fit_args) + try: + fit_args = self._get_block_args(block_name, block.fit_args, context) + block.fit(**fit_args) + except Exception: + LOGGER.exception("Exception caught fitting MLBlock %s", block_name) + raise if block_name != last_block_name: - produce_args = self._get_block_args(block_name, block.produce_args, context) - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + output_dict = self._get_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) + except Exception: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + raise def predict(self, X=None, **kwargs): """Produce predictions using the blocks of this pipeline. @@ -252,14 +258,18 @@ def predict(self, X=None, **kwargs): last_block_name = list(self.blocks.keys())[-1] for block_name, block in self.blocks.items(): - produce_args = self._get_block_args(block_name, block.produce_args, context) - LOGGER.debug("Producing block %s", block_name) - outputs = block.produce(**produce_args) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + outputs = block.produce(**produce_args) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if block_name != last_block_name: + output_dict = self._get_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) + + except Exception: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + raise return outputs diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 337116e7..c6e50790 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -8,9 +8,12 @@ """ import json +import logging import os import sys +LOGGER = logging.getLogger(__name__) + _PRIMITIVES_PATHS = [ os.path.join(os.getcwd(), 'mlblocks_primitives'), os.path.join(sys.prefix, 'mlblocks_primitives'), @@ -35,6 +38,7 @@ def add_primitives_path(path): if not os.path.isdir(path): raise ValueError('Invalid path: {}'.format(path)) + LOGGER.debug('Adding new primitives path %s', path) _PRIMITIVES_PATHS.insert(0, os.path.abspath(path)) @@ -73,6 +77,7 @@ def load_primitive(name): json_path = os.path.join(base_path, name + '.json') if os.path.isfile(json_path): with open(json_path, 'r') as json_file: + LOGGER.debug('Loading primitive %s from %s', name, json_path) return json.load(json_file) raise ValueError("Unknown primitive: {}".format(name)) From a2cf239fd22d5d8c0e50eabef11d65f5f2c65bbd Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 9 Jan 2019 18:04:33 +0100 Subject: [PATCH 005/160] Filter conditionals from tunable hyperparameters --- mlblocks/mlblock.py | 35 +++++-- tests/test_mlblock.py | 219 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 7 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 04a4bf55..618ebc75 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -110,6 +110,33 @@ def _extract_params(self, kwargs, hyperparameters): return init_params, fit_params, produce_params + @staticmethod + def 
_filter_conditional(conditional, init_params): + condition = conditional['condition'] + if condition not in init_params: + return conditional + + condition_value = init_params[condition] + values = conditional['values'] + conditioned = values.get(condition_value) or values.get('*') + if conditioned: + return conditioned + + @classmethod + def _get_tunable(cls, hyperparameters, init_params): + tunable = dict() + for name, param in hyperparameters.get('tunable', dict()).items(): + if name not in init_params: + if param['type'] == 'conditional': + param = cls._filter_conditional(param, init_params) + if param is not None: + tunable[name] = param + + else: + tunable[name] = param + + return tunable + def __init__(self, name, **kwargs): self.name = name @@ -136,13 +163,7 @@ def __init__(self, name, **kwargs): self._fit_params = fit_params self._produce_params = produce_params - tunable = hyperparameters.get('tunable', dict()) - self._tunable = { - name: param - for name, param in tunable.items() - if name not in init_params - # TODO: filter conditionals - } + self._tunable = self._get_tunable(hyperparameters, init_params) default = { name: param['default'] diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index abc235b0..970df5ed 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -23,6 +23,225 @@ class TestMLBlock(TestCase): def test__extract_params(self): pass + def test__get_tunable_no_conditionals(self): + """If there are no conditionals, tunables are returned unmodified.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + } + } + assert tunable == expected + + def test__get_tunable_no_condition(self): + """If there is a conditiona but no condition, conditional is returned unmodified.""" + + # setup + init_params = { + 'an_init_param': 'a_value' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 1: { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 1: { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + assert tunable == expected + + def test__get_tunable_condition_match(self): + """If there is a conditional and it matches, only that part is returned.""" + + # setup + init_params = { + 'a_condition': 'match' + } + hyperparameters = { + 'tunable': { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 'this_is_conditional': { + 'type': 'conditional', + 'condition': 'a_condition', + 'default': 1, + 'values': { + 'match': { + 'type': 'int', + 'default': 0 + }, + '*': { + 'type': 'str', + 'default': 'whatever' + } + } + } + } + } + + # run + tunable = MLBlock._get_tunable(hyperparameters, init_params) + + # assert + expected = { + 'this_is_not_conditional': { + 'type': 'int', + 'default': 1 + }, + 
'this_is_conditional': {
+                'type': 'int',
+                'default': 0
+            }
+        }
+        assert tunable == expected
+
+    def test__get_tunable_condition_wildcard_match(self):
+        """If there is a conditional and it matches the wildcard, only that part is returned."""
+
+        # setup
+        init_params = {
+            'a_condition': 'no_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': 1,
+                    'values': {
+                        'match': {
+                            'type': 'int',
+                            'default': 0
+                        },
+                        '*': {
+                            'type': 'str',
+                            'default': 'whatever'
+                        }
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1
+            },
+            'this_is_conditional': {
+                'type': 'str',
+                'default': 'whatever'
+            }
+        }
+        assert tunable == expected
+
+    def test__get_tunable_condition_no_match(self):
+        """If there is a conditional without match or wildcard, it is not returned."""
+
+        # setup
+        init_params = {
+            'a_condition': 'no_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': 1,
+                    'values': {
+                        'match': {
+                            'type': 'int',
+                            'default': 0
+                        }
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1
+            }
+        }
+        assert tunable == expected
+
     @patch('mlblocks.mlblock.MLBlock.set_hyperparameters')
     @patch('mlblocks.mlblock.import_object')
     @patch('mlblocks.mlblock.load_primitive')

From 31b36d4779e8faeb38449025aec30b0b90c51378 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 17:59:03 +0100
Subject: [PATCH 006/160] Changed slightly the behavior of the conditional
 hyperparameters. Also include docs

---
 docs/advanced_usage/hyperparameters.rst |  19 ++-
 mlblocks/mlblock.py                     |   8 +-
 tests/test_mlblock.py                   | 192 +++++++++++++++++-------
 3 files changed, 156 insertions(+), 63 deletions(-)

diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst
index bc31d4fd..71686ac5 100644
--- a/docs/advanced_usage/hyperparameters.rst
+++ b/docs/advanced_usage/hyperparameters.rst
@@ -165,6 +165,19 @@ Conditional Hyperparameters
 
 In some other cases, the values that a hyperparameter can take depend on the value of
 another one.
+For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending
+on the kernel used some other hyperparameters may or may not be used, or they might be able
+to take only some specific values.
+
+In this case, the ``type`` of the hyperparameter whose values depend on the other is specified
+as ``conditional``.
+When this happens, two additional entries are required:
+
+* an entry called ``condition``, which specifies the name of the other hyperparameter, the value
+  of which is evaluated to decide which values this hyperparameter can take.
+* an additional subdictionary called ``values``, which relates the possible values that the
+  `condition` hyperparameter can have with the full specifications of the type and values that
+  this hyperparameter can take in each case.
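For illustration only, the resolution of such a specification can be sketched in a few
lines of Python. This is a simplified approximation of the logic described above, not the
actual **MLBlocks** implementation, and the function name is made up for the example::

    def resolve_conditional(spec, init_params):
        # Look up the value of the hyperparameter named by ``condition``.
        condition_value = init_params.get(spec['condition'])

        # If it matches an entry in ``values``, use that specification;
        # otherwise fall back to the ``default`` entry of the conditional.
        return spec['values'].get(condition_value, spec.get('default'))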
Suppose, for example, that the primitive explained in the previous point does not expect the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter, @@ -190,7 +203,7 @@ In this case, the hyperparameters would be annotated like this:: } "max_features_aggregation": { "type": "conditional", - "condition": "mas_features", + "condition": "max_features", "default": null, "values": { "auto": { @@ -202,6 +215,10 @@ In this case, the hyperparameters would be annotated like this:: } } +.. note:: Just like a regular hyperparameter, if there is no match the default entry is used. + In this example, the ``null`` value indicates that the hyperparameter needs to be + disabled if there is no match, but instead of it we could add there a full specification + of type, range and default value as a nested dictionary to be used by default. .. _JSON Annotations: primitives.html#json-annotations .. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 618ebc75..a5cdb6a4 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -113,14 +113,14 @@ def _extract_params(self, kwargs, hyperparameters): @staticmethod def _filter_conditional(conditional, init_params): condition = conditional['condition'] + default = conditional.get('default') + if condition not in init_params: - return conditional + return default condition_value = init_params[condition] values = conditional['values'] - conditioned = values.get(condition_value) or values.get('*') - if conditioned: - return conditioned + return values.get(condition_value, default) @classmethod def _get_tunable(cls, hyperparameters, init_params): diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 970df5ed..5273d40c 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -34,7 +34,8 @@ def test__get_tunable_no_conditionals(self): 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] } } } @@ -46,13 +47,14 @@ def test__get_tunable_no_conditionals(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] } } assert tunable == expected def test__get_tunable_no_condition(self): - """If there is a conditiona but no condition, conditional is returned unmodified.""" + """If there is a conditional but no condition, the default is used.""" # setup init_params = { @@ -62,20 +64,27 @@ def test__get_tunable_no_condition(self): 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 1: { - 'type': 'int', - 'default': 0 - }, - '*': { + 'not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -89,22 +98,13 @@ def test__get_tunable_no_condition(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { - 'type': 'conditional', - 'condition': 'a_condition', - 'default': 1, - 'values': { - 1: { - 'type': 'int', - 'default': 0 - }, - '*': { - 'type': 'str', - 'default': 'whatever' - } - } + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] } } assert tunable == expected @@ -114,26 +114,33 @@ def 
test__get_tunable_condition_match(self): # setup init_params = { - 'a_condition': 'match' + 'a_condition': 'a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 'match': { - 'type': 'int', - 'default': 0 - }, - '*': { + 'not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -147,40 +154,49 @@ def test__get_tunable_condition_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'int', - 'default': 0 + 'default': 0, + 'range': [1, 10] } } assert tunable == expected - def test__get_tunable_condition_wildcard_match(self): - """If there is a conditional and it matches the wildcard, only that part is returned.""" + def test__get_tunable_condition_no_match(self): + """If there is a conditional and it does not match, the default is used.""" # setup init_params = { - 'a_condition': 'no_match' + 'a_condition': 'not_a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': { + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] + }, 'values': { - 'match': { - 'type': 'int', - 'default': 0 - }, - '*': { + 'also_not_a_match': { 'type': 'str', - 'default': 'whatever' + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { + 'type': 'int', + 'default': 0, + 'range': [1, 10] } } } @@ -194,36 +210,45 @@ def test__get_tunable_condition_wildcard_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { - 'type': 'str', - 'default': 'whatever' + 'type': 'float', + 'default': 0.1, + 'values': [0, 1] } } assert tunable == expected - def test__get_tunable_condition_no_match(self): - """If there is a conditional without match or wildcard, it is not returned.""" + def test__get_tunable_condition_default_null(self): + """If there is no match and default is null (None), this param is not included.""" # setup init_params = { - 'a_condition': 'no_match' + 'a_condition': 'not_a_match' } hyperparameters = { 'tunable': { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] }, 'this_is_conditional': { 'type': 'conditional', 'condition': 'a_condition', - 'default': 1, + 'default': None, 'values': { - 'match': { + 'also_not_a_match': { + 'type': 'str', + 'default': 'a', + 'values': ['a', 'b'] + }, + 'neither_a_match': { 'type': 'int', - 'default': 0 + 'default': 0, + 'range': [1, 10] } } } @@ -237,7 +262,58 @@ def test__get_tunable_condition_no_match(self): expected = { 'this_is_not_conditional': { 'type': 'int', - 'default': 1 + 'default': 1, + 'range': [1, 10] + } + } + assert tunable == expected + + def test__get_tunable_condition_match_null(self): + """If there is a match and it is null (None), this param is not included. + + This stands even if the default is not null. 
+        """
+
+        # setup
+        init_params = {
+            'a_condition': 'a_match'
+        }
+        hyperparameters = {
+            'tunable': {
+                'this_is_not_conditional': {
+                    'type': 'int',
+                    'default': 1,
+                    'range': [1, 10]
+                },
+                'this_is_conditional': {
+                    'type': 'conditional',
+                    'condition': 'a_condition',
+                    'default': {
+                        'type': 'float',
+                        'default': 0.1,
+                        'values': [0, 1]
+                    },
+                    'values': {
+                        'not_a_match': {
+                            'type': 'str',
+                            'default': 'a',
+                            'values': ['a', 'b']
+                        },
+                        'a_match': None
+                    }
+                }
+            }
+        }
+
+        # run
+        tunable = MLBlock._get_tunable(hyperparameters, init_params)
+
+        # assert
+        expected = {
+            'this_is_not_conditional': {
+                'type': 'int',
+                'default': 1,
+                'range': [1, 10]
+            }
+        }
+        assert tunable == expected
+
     @patch('mlblocks.mlblock.MLBlock.set_hyperparameters')
     @patch('mlblocks.mlblock.import_object')
     @patch('mlblocks.mlblock.load_primitive')

From 0ede2707da8e4d2866416cac28e2b03c69a68e47 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:26:21 +0100
Subject: [PATCH 007/160] Release notes for v0.3.0

---
 HISTORY.md           | 7 +++++++
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 7 +++----
 setup.py             | 2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index d08624dc..a312c9cb 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+0.3.0 - New Primitives Discovery
+--------------------------------
+
+* New primitives discovery system based on `entry_points`.
+* Conditional Hyperparameters filtering in MLBlock initialization.
+* Improved logging and exception reporting.
+
 0.2.4 - New Datasets and Unit Tests
 -----------------------------------
 
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 43079986..3a9e6bcb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.2.5-dev'
+__version__ = '0.3.0-dev'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index fbc69b07..a9255027 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
 [bumpversion]
-current_version = 0.2.5-dev
+current_version = 0.3.0-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize = 
+serialize =
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values = 
+values =
 	dev
 	release
diff --git a/setup.py b/setup.py
index 9d4b4cfc..5c21f44b 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.2.5-dev',
+    version='0.3.0-dev',
     zip_safe=False,
 )

From bb0bb0d44bcc44e1517825409e1d092670ddde27 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:33:45 +0100
Subject: [PATCH 008/160] =?UTF-8?q?Bump=20version:=200.3.0-dev=20=E2=86=92?=
 =?UTF-8?q?=200.3.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 7 ++++---
 setup.py             | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3a9e6bcb..93bd80bb 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.0-dev'
+__version__ = '0.3.0'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9255027..3026d2ba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,15 +1,15 @@
 [bumpversion]
-current_version = 0.3.0-dev
+current_version = 0.3.0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize =
+serialize = 
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values =
+values = 
 	dev
 	release
 
@@ -45,3 +45,4 @@ collect_ignore = ['setup.py']
 
 [tool:pylint]
 good-names = X,y
+
diff --git a/setup.py b/setup.py
index 5c21f44b..a59a74f0 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.0-dev',
+    version='0.3.0',
     zip_safe=False,
 )

From e1ca77bce3c4537c0800a4c1395e1b6bbde5465d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 10 Jan 2019 18:34:07 +0100
Subject: [PATCH 009/160] =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?=
 =?UTF-8?q?.3.1-dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 93bd80bb..cf326495 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -18,7 +18,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.0'
+__version__ = '0.3.1-dev'
 
 __all__ = [
     'MLBlock', 'MLPipeline', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 3026d2ba..e976dec7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0
+current_version = 0.3.1-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
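The ``parse`` expression in the ``bumpversion`` configuration above is a plain Python
regular expression. A minimal sanity check, added here purely for illustration (the
``PARSE`` constant simply mirrors the configured expression), can exercise it with the
standard ``re`` module::

    import re

    PARSE = r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?'

    # The named groups split a version string into its serialized parts.
    match = re.match(PARSE, '0.3.1-dev')
    assert match.groupdict() == {
        'major': '0', 'minor': '3', 'patch': '1', 'release': 'dev'
    }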
diff --git a/setup.py b/setup.py index a59a74f0..a8ac84d7 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.0', + version='0.3.1-dev', zip_safe=False, ) From d3cbee730139b2d0117a1de1474a581844505196 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 19 Apr 2019 13:38:02 +0200 Subject: [PATCH 010/160] Initial implementation to work with intermediate outputs --- mlblocks/mlpipeline.py | 82 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 058737ee..d5928b69 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -166,7 +166,7 @@ def _get_block_args(self, block_name, block_args, context): return kwargs - def _get_outputs(self, block_name, outputs, block_outputs): + def _extract_outputs(self, block_name, outputs, block_outputs): # TODO: type validation and/or transformation should be done here if not isinstance(outputs, tuple): @@ -188,7 +188,40 @@ def _get_outputs(self, block_name, outputs, block_outputs): return output_dict - def fit(self, X=None, y=None, **kwargs): + def _get_block_name(self, index): + return list(self.blocks.keys())[index] + + def _get_output_spec(self, output): + if output is None: + return None, None + + if isinstance(output, int): + output = self._get_block_name(output) + + if output in self.blocks: + return output, None + + if '.' in output: + output_block, output_variable = output.rsplit('.', 1) + if output_block not in self.blocks: + raise ValueError('Unknown block name: {}'.format(output_block)) + + return output_block, output_variable + + last_block_name = self._get_block_name(-1) + return last_block_name, output + + def _get_output(self, output_variable, context): + if output_variable: + if output_variable not in context: + raise ValueError('Output variable {} not found in context' + .format(output_variable)) + + return context[output_variable] + else: + return context + + def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs): """Fit the blocks of this pipeline. 
Sequentially call the `fit` and the `produce` methods of each block, @@ -213,8 +246,19 @@ def fit(self, X=None, y=None, **kwargs): } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output) + last_block_name = self._get_block_name(-1) + + if isinstance(skip_to, int): + skip_to = self._get_block_name(skip_to) + for block_name, block in self.blocks.items(): + if block_name == skip_to: + skip_to = False + elif skip_to: + LOGGER.debug("Skipping block %s fit", block_name) + continue + LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) @@ -223,19 +267,22 @@ def fit(self, X=None, y=None, **kwargs): LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - if block_name != last_block_name: + if (block_name != last_block_name) or (block_name == output_block): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) context.update(output_dict) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def predict(self, X=None, **kwargs): + if block_name == output_block: + return self._get_output(output_variable, context) + + def predict(self, X=None, output='y', skip_to=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -256,22 +303,35 @@ def predict(self, X=None, **kwargs): } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output) + + if isinstance(skip_to, int): + skip_to = self._get_block_name(skip_to) + for block_name, block in self.blocks.items(): + if block_name == skip_to: + skip_to = False + elif skip_to: + LOGGER.debug("Skipping block %s produce", block_name) + continue + LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if block_name == output_block: + return self._get_output(output_variable, context) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - return outputs + if skip_to: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(skip_to)) def to_dict(self): """Return all the details of this MLPipeline in a dict. From 59fae909d44afb78005425c6c4a24de567391eb5 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:48:38 +0200 Subject: [PATCH 011/160] Update contributing guide to match the current release workflow --- CONTRIBUTING.rst | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2db74080..4fce53bf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -172,24 +172,26 @@ The process of releasing a new version involves several steps combining both ``g 1. 
Merge what is in ``master`` branch into ``stable`` branch.
 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
-3. Create a new TAG pointing at the correspoding commit in ``stable`` branch.
+3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
 4. Merge the new commit from ``stable`` into ``master``.
-5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next
-   development interation.
+5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
+   to open the next development iteration.
 
-**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled
-**Unreleased** with the list of changes that will be included in the new version, and that
-these changes are committed and available in ``master`` branch.
-Normally this is just a list of the Pull Requests that have been merged since the latest version.
+.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
+   entry that explains the changes that will be included in the new version.
+   Normally this is just a list of the Pull Requests that have been merged to master
+   since the last release.
 
-Once this is done, just run the following commands::
+Once this is done, run one of the following commands:
+
+1. If you are releasing a patch version::
 
-    git checkout stable
-    git merge --no-ff master   # This creates a merge commit
-    bumpversion release   # This creates a new commit and a TAG
-    git push --tags origin stable
     make release
-    git checkout master
-    git merge stable
-    bumpversion --no-tag patch
-    git push
+
+2. If you are releasing a minor version::
+
+    make release-minor
+
+3. If you are releasing a major version::
+
+    make release-major

From e768037076387fcb9a33e494c9c89421f0c657a8 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:49:47 +0200
Subject: [PATCH 012/160] Update docs config

---
 Makefile           |  1 -
 docs/changelog.rst |  2 +-
 docs/conf.py       | 20 +++++++-------------
 setup.py           |  1 -
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index dc62e90d..c2d2aaa4 100644
--- a/Makefile
+++ b/Makefile
@@ -122,7 +122,6 @@ coverage: ## check code coverage quickly with the default Python
 .PHONY: docs
 docs: clean-docs ## generate Sphinx HTML documentation, including API docs
 	$(MAKE) -C docs html
-	touch docs/_build/html/.nojekyll
 
 .PHONY: view-docs
 view-docs: docs ## view docs in browser
diff --git a/docs/changelog.rst b/docs/changelog.rst
index fcd2eb2d..d26e5be8 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1 +1 @@
-.. include:: ../HISTORY.md
+.. mdinclude:: ../HISTORY.md
diff --git a/docs/conf.py b/docs/conf.py
index 8659996f..9b4595ec 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,18 +18,9 @@
 # relative to the documentation root, use os.path.abspath to make it
 # absolute, like shown here.
 
-import os
-import sys
-
 import sphinx_rtd_theme  # For read the docs theme
-from recommonmark.parser import CommonMarkParser
-# from recommonmark.transform import AutoStructify
-
-# sys.path.insert(0, os.path.abspath('..'))
 
 import mlblocks
-#
-# mlblocks.add_primitives_path('../mlblocks_primitives')
 
 # -- General configuration ---------------------------------------------
 
@@ -40,8 +31,11 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
-    'sphinx.ext.napoleon',
+    'm2r',
+    'sphinx.ext.autodoc',
     'sphinx.ext.githubpages',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
     'sphinx.ext.graphviz',
     'IPython.sphinxext.ipython_console_highlighting',
     'IPython.sphinxext.ipython_directive',
@@ -56,9 +50,9 @@
 # You can specify multiple suffix as a list of string:
 source_suffix = ['.rst', '.md', '.ipynb']
 
-source_parsers = {
-    '.md': CommonMarkParser,
-}
+# source_parsers = {
+#     '.md': CommonMarkParser,
+# }
 
 # The master toctree document.
 master_doc = 'index'
diff --git a/setup.py b/setup.py
index a8ac84d7..f6991ab1 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
     'graphviz==0.9',
     'ipython==6.5.0',
     'matplotlib==2.2.3',
-    'recommonmark>=0.4.0',
 
     # style check
     'flake8>=3.5.0',

From 080580d45c9b47680fbc31d30aee4e8478292711 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:08 +0200
Subject: [PATCH 013/160] Remove spaces

---
 setup.cfg | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e976dec7..62ced521 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,13 +3,13 @@
 current_version = 0.3.1-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize = 
+serialize =
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values = 
+values =
 	dev
 	release
 
@@ -45,4 +45,3 @@ collect_ignore = ['setup.py']
 
 [tool:pylint]
 good-names = X,y
-

From e25fa6d3ac3af2f20b205ed73d91d28124bc8c16 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 6 May 2019 22:50:32 +0200
Subject: [PATCH 014/160] Add docstrings

---
 mlblocks/mlpipeline.py | 127 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 14 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d5928b69..abbac922 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -69,6 +69,7 @@ class MLPipeline():
     """
 
     def _get_tunable_hyperparameters(self):
+        """Get the tunable hyperparameters from all the blocks in this pipeline."""
        tunable = {}
         for block_name, block in self.blocks.items():
             tunable[block_name] = block.get_tunable_hyperparameters()
@@ -140,6 +141,24 @@ def set_hyperparameters(self, hyperparameters):
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
 
     def _get_block_args(self, block_name, block_args, context):
+        """Get the arguments expected by the block method from the context.
+
+        The arguments will be taken from the context using both the method
+        arguments specification and the `input_names` given when the pipeline
+        was created.
+
+        Args:
+            block_name (str): Name of this block. Used to find the corresponding
+                input_names.
+            block_args (list): list of method argument specifications from the
+                primitive.
+            context (dict): current context dictionary.
+
+        Returns:
+            dict:
+                A dictionary containing the argument names and values to pass
+                to the method.
+        """
         # TODO: type validation and/or transformation should be done here
 
         input_names = self.input_names.get(block_name, dict())
@@ -167,6 +186,7 @@ def _get_block_args(self, block_name, block_args, context):
         return kwargs
 
     def _extract_outputs(self, block_name, outputs, block_outputs):
+        """Extract the outputs of the method as a dict to be set into the context."""
         # TODO: type validation and/or transformation should be done here
 
         if not isinstance(outputs, tuple):
@@ -189,9 +209,36 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
         return output_dict
 
     def _get_block_name(self, index):
+        """Get the name of the block in the `index` position."""
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
+        """Parse the output specification and get a block name and a variable name.
+
+        The output specification can be of two types: int and str.
+
+        If it is an integer, it is interpreted as a block index, and the variable name
+        is considered to be ``None``, which means that the whole context will be returned.
+
+        If it is a string, it is interpreted as the block name, and it has to match a block
+        name exactly, including its hash and counter number ``#n``. Optionally, a variable
+        name can be passed at the end using a ``'.'`` as a separator.
+        In this case, the format of the string is `{block_name}.{variable_name}`. Note
+        that the block name can also contain dots, so only the leftmost dot will be
+        considered, and only if the complete string does not match exactly a block name.
+
+        Args:
+            output (str or int): Output specification as either a string or an integer.
+
+        Returns:
+            tuple:
+                The output is a tuple containing:
+                    * block_name (str): name of the block from which the output will be
+                      returned, including its counter number.
+                    * variable_name (str): Name of the variable to extract from the context.
+                      It can be ``None``, which means that the whole context is to be
+                      returned.
+        """
         if output is None:
             return None, None
 
@@ -212,6 +259,10 @@ def _get_output_spec(self, output):
         return last_block_name, output
 
     def _get_output(self, output_variable, context):
+        """Get the specified output variable from the context.
+
+        If the variable name is ``None``, return the entire context.
+        """
         if output_variable:
             if output_variable not in context:
                 raise ValueError('Output variable {} not found in context'
@@ -221,7 +272,7 @@ def _get_output(self, output_variable, context):
         else:
             return context
 
-    def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
+    def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
         """Fit the blocks of this pipeline.
 
         Sequentially call the `fit` and the `produce` methods of each block,
@@ -237,8 +288,32 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
             X: Fit Data, which the pipeline will learn from.
             y: Fit Data labels, which the pipeline will use to learn how to
                 behave.
+            output (str or int): Output specification, which can be a string or an integer.
+                If an integer is given, it is interpreted as the block number, and the whole
+                context after running the specified block will be returned.
+                If a string is given, it is expected to be the name of one block, including
+                its counter number at the end. Optionally, a variable name can be included
+                at the end after the counter number using a ``'.'`` as a separator between the
+                block name and the variable name. If the variable name is given, this will be
+                extracted from the context and returned. Otherwise, the whole context will
+                be returned.
+            start_on (str or int): Block index or block name to start processing from. The
+                value can either be an integer, which will be interpreted as a block index,
+                or the name of a block, including the counter number at the end.
+                If given, the execution of the pipeline will start on the specified block,
+                and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
+
+        Returns:
+            None or dict or object:
+                * If no output is specified, nothing will be returned.
+                * If an output block has been specified without an output variable, the
+                  context dictionary will be returned after the produce method of that block
+                  has been called.
+                * If both an output block and an output variable have been specified,
+                  the value of that variable from the context will be extracted and returned
+                  after the produce method of that block has been called.
         """
         context = {
             'X': X,
            'y': y
         }
         context.update(kwargs)
@@ -249,13 +324,13 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
         output_block, output_variable = self._get_output_spec(output)
         last_block_name = self._get_block_name(-1)
 
-        if isinstance(skip_to, int):
-            skip_to = self._get_block_name(skip_to)
+        if isinstance(start_on, int):
+            start_on = self._get_block_name(start_on)
 
         for block_name, block in self.blocks.items():
-            if block_name == skip_to:
-                skip_to = False
-            elif skip_to:
+            if block_name == start_on:
+                start_on = False
+            elif start_on:
                 LOGGER.debug("Skipping block %s fit", block_name)
                 continue
 
@@ -282,7 +357,7 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
             if block_name == output_block:
                 return self._get_output(output_variable, context)
 
-    def predict(self, X=None, output='y', skip_to=None, **kwargs):
+    def predict(self, X=None, output='y', start_on=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
         Sequentially call the `produce` method of each block, capturing the
@@ -295,8 +370,32 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs):
 
         Args:
             X: Data which the pipeline will use to make predictions.
+            output (str or int): Output specification, which can be a string or an integer.
+                If an integer is given, it is interpreted as the block number, and the whole
+                context after running the specified block will be returned.
+                If a string is given, it is expected to be the name of one block, including
+                its counter number at the end. Optionally, a variable name can be included
+                at the end after the counter number using a ``'.'`` as a separator between the
+                block name and the variable name. If the variable name is given, this will be
+                extracted from the context and returned. Otherwise, the whole context will
+                be returned.
+            start_on (str or int): Block index or block name to start processing from. The
+                value can either be an integer, which will be interpreted as a block index,
+                or the name of a block, including the counter number at the end.
+                If given, the execution of the pipeline will start on the specified block,
+                and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
+
+        Returns:
+            None or dict or object:
+                * If no output is specified, the output of the last block will be returned.
+                * If an output block has been specified without an output variable, the
+                  context dictionary will be returned after the produce method of that block
+                  has been called.
+ * If both an output block and an output variable have been specified, + the value of that variable from the context will extracted and returned + after the produce method of that block has been called. """ context = { 'X': X @@ -305,13 +404,13 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): output_block, output_variable = self._get_output_spec(output) - if isinstance(skip_to, int): - skip_to = self._get_block_name(skip_to) + if isinstance(start_on, int): + start_on = self._get_block_name(start_on) for block_name, block in self.blocks.items(): - if block_name == skip_to: - skip_to = False - elif skip_to: + if block_name == start_on: + start_on = False + elif start_on: LOGGER.debug("Skipping block %s produce", block_name) continue @@ -329,9 +428,9 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - if skip_to: + if start_on: # We skipped all the blocks up to the end - raise ValueError('Unknown block name: {}'.format(skip_to)) + raise ValueError('Unknown block name: {}'.format(start_on)) def to_dict(self): """Return all the details of this MLPipeline in a dict. From 5e9be7aa7188d38ca6eafb684c24171b9e61f322 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:51:09 +0200 Subject: [PATCH 015/160] Update primitive names to match the latest versions of MLPrimitives --- docs/getting_started/quickstart.rst | 2 +- docs/pipeline_examples/graph.rst | 4 ++-- docs/pipeline_examples/text.rst | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2e00ece6..2115fcef 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,7 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] pipeline = MLPipeline(primitives) diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 5503e739..54ef85a1 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -39,7 +39,7 @@ additional information not found inside `X`. primitives = [ 'networkx.link_prediction_feature_extraction', - 'mlprimitives.feature_extraction.CategoricalEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'sklearn.preprocessing.StandardScaler', 'xgboost.XGBClassifier' ] @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. 
_XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index df8a9d5a..03472ea3 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -40,31 +40,31 @@ for later ones. # set up the pipeline primitives = [ - "mlprimitives.counters.UniqueCounter", - "mlprimitives.text.TextCleaner", - "mlprimitives.counters.VocabularyCounter", + "mlprimitives.custom.counters.UniqueCounter", + "mlprimitives.custom.text.TextCleaner", + "mlprimitives.custom.counters.VocabularyCounter", "keras.preprocessing.text.Tokenizer", "keras.preprocessing.sequence.pad_sequences", "keras.Sequential.LSTMTextClassifier" ] input_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "X": "y" } } output_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "counts": "classes" }, - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "counts": "vocabulary_size" } } init_params = { - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "add": 1 }, - "mlprimitives.text.TextCleaner#1": { + "mlprimitives.custom.text.TextCleaner#1": { "language": "en" }, "keras.preprocessing.sequence.pad_sequences#1": { @@ -116,12 +116,12 @@ to encode all the string features, and go directly into the nltk.download('stopwords') primitives = [ - 'mlprimitives.text.TextCleaner', - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.text.TextCleaner', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] init_params = { - 'mlprimitives.text.TextCleaner': { + 'mlprimitives.custom.text.TextCleaner': { 'column': 'text', 'language': 'nl' }, From 9f0ae6a3fa000896d8f530b72f6da46d23c31e4b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:33 +0200 Subject: [PATCH 016/160] Add random state to datasets get_splits --- mlblocks/datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index b5ed6b46..fb32df9c 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -141,7 +141,7 @@ def _get_split(data, index): else: return data[index] - def get_splits(self, n_splits=1): + def get_splits(self, n_splits=1, random_state=0): """Return splits of this dataset ready for Cross Validation. 
If n_splits is 1, a tuple containing the X for train and test @@ -166,12 +166,13 @@ def get_splits(self, n_splits=1): self.data, self.target, shuffle=self._shuffle, - stratify=stratify + stratify=stratify, + random_state=random_state ) else: cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) + cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state) splits = list() for train, test in cv.split(self.data, self.target): From 5aea64755b7b7f9b4e68f6faa9a0912c1a55033a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:58 +0200 Subject: [PATCH 017/160] Rename output and start arguments --- mlblocks/mlpipeline.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index abbac922..91e44341 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -272,7 +272,7 @@ def _get_output(self, output_variable, context): else: return context - def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): + def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. Sequentially call the `fit` and the `produce` methods of each block, @@ -288,7 +288,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): X: Fit Data, which the pipeline will learn from. y: Fit Data labels, which the pipeline will use to learn how to behave. - output (str or int): Output specification, which can be a string or an integer. + output_ (str or int): Output specification, which can be a string or an integer. If an integer is given, it is interpreted as the block number, and the whole context after running the specified block will be returned. If a string is given, it is expected to be the name of one block, including @@ -297,7 +297,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): block name and the variable name. If the variable name is given, this will be extracted from the context and returned. Otherwise, the whole context will be returned. - start_on (str or int): Block index or block name to start processing from. The + start_ (str or int): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, @@ -321,16 +321,16 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): } context.update(kwargs) - output_block, output_variable = self._get_output_spec(output) + output_block, output_variable = self._get_output_spec(output_) last_block_name = self._get_block_name(-1) - if isinstance(start_on, int): - start_on = self._get_block_name(start_on) + if isinstance(start_, int): + start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_on: - start_on = False - elif start_on: + if block_name == start_: + start_ = False + elif start_: LOGGER.debug("Skipping block %s fit", block_name) continue @@ -357,7 +357,7 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): if block_name == output_block: return self._get_output(output_variable, context) - def predict(self, X=None, output='y', start_on=None, **kwargs): + def predict(self, X=None, output_='y', start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. 
Sequentially call the `produce` method of each block, capturing the @@ -370,7 +370,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): Args: X: Data which the pipeline will use to make predictions. - output (str or int): Output specification, which can be a string or an integer. + output_ (str or int): Output specification, which can be a string or an integer. If an integer is given, it is interpreted as the block number, and the whole context after running the specified block will be returned. If a string is given, it is expected to be the name of one block, including @@ -379,7 +379,7 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): block name and the variable name. If the variable name is given, this will be extracted from the context and returned. Otherwise, the whole context will be returned. - start_on (str or int): Block index or block name to start processing from. The + start_ (str or int): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, @@ -402,15 +402,15 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): } context.update(kwargs) - output_block, output_variable = self._get_output_spec(output) + output_block, output_variable = self._get_output_spec(output_) - if isinstance(start_on, int): - start_on = self._get_block_name(start_on) + if isinstance(start_, int): + start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_on: - start_on = False - elif start_on: + if block_name == start_: + start_ = False + elif start_: LOGGER.debug("Skipping block %s produce", block_name) continue @@ -428,9 +428,9 @@ def predict(self, X=None, output='y', start_on=None, **kwargs): LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - if start_on: + if start_: # We skipped all the blocks up to the end - raise ValueError('Unknown block name: {}'.format(start_on)) + raise ValueError('Unknown block name: {}'.format(start_)) def to_dict(self): """Return all the details of this MLPipeline in a dict. 
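
Patch 014 introduced partial execution and this patch renames its control arguments:
`output_` stops the run at a given block and returns data from the context, while
`start_` resumes the run from a given block, skipping everything before it. The
trailing underscores keep the control arguments from colliding with context variables
passed through `**kwargs`. A rough usage sketch of the resulting API, assuming the
same two scikit-learn primitives that the tests added in the next patch use, and with
`X_train`, `y_train` and `X_test` standing in for any suitable data splits:

    from mlblocks import MLPipeline

    primitives = [
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ]
    pipeline = MLPipeline(primitives)

    # Fit only the first block and get the whole context back as a dict.
    context = pipeline.fit(X_train, y_train, output_=0)

    # The context can be inspected or modified here, and fitting can then
    # be resumed on the second block, skipping the first one.
    pipeline.fit(start_=1, **context)

    # predict accepts the same two control arguments.
    y_pred = pipeline.predict(X_test)
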
From 4607b3898aa9767774f872b936f2311492179746 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:13:12 +0200 Subject: [PATCH 018/160] Add unit tests for partial outputs feature --- tests/features/test_partial_outputs.py | 133 +++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 tests/features/test_partial_outputs.py diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py new file mode 100644 index 00000000..ce28d457 --- /dev/null +++ b/tests/features/test_partial_outputs.py @@ -0,0 +1,133 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np + +from mlblocks.datasets import load_iris +from mlblocks.mlpipeline import MLPipeline + + +def almost_equal(obj1, obj2): + if isinstance(obj1, dict): + if not isinstance(obj2, dict): + raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) + + for key, value in obj1.items(): + if key not in obj2: + raise AssertionError("{} not in {}".format(key, obj2)) + almost_equal(value, obj2[key]) + + else: + np.testing.assert_almost_equal(obj1, obj2) + + +class TestPartialOutputs(TestCase): + def setUp(self): + dataset = load_iris() + + self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1) + + def test_fit_output(self): + + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + int_block = 0 + invalid_int = 10 + str_block = 'sklearn.preprocessing.StandardScaler#1' + invalid_block = 'InvalidBlockName' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y' + invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' + + # Run + int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block) + str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block) + str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5], + output_=str_block_variable) + no_output = pipeline.fit(self.X_train, self.y_train) + + # Assert successful calls + X = np.array([ + [0.71269665, -1.45152899, 0.55344946, 0.31740553], + [0.26726124, 1.23648766, -1.1557327, -1.0932857], + [-1.95991577, 0.967686, -1.1557327, -1.0932857], + [0.71269665, -0.645124, 0.39067021, 0.31740553], + [0.26726124, -0.10752067, 1.36734573, 1.55176035] + ]) + y = np.array([1, 0, 0, 1, 2]) + context = { + 'X': X, + 'y': y + } + almost_equal(context, int_out) + almost_equal(context, str_out) + + almost_equal(y, str_out_variable) + + assert no_output is None + + # Run asserting exceptions + with self.assertRaises(IndexError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable) + + def test_fit_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X_train, + 'y': self.y_train + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.fit(start_=int_start, **context) + pipeline.fit(start_=str_start, **context) + + # Assert that mock 
has not been called + block_mock.fit.assert_not_called() + + def test_predict_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + pipeline.fit(self.X_train, self.y_train) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run first block + context = { + 'X': self.X_train, + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.predict(start_=int_start, **context) + pipeline.predict(start_=str_start, **context) + + # Assert that mock has not been called + block_mock.predict.assert_not_called() From 980794b67165e286d49cb81cf742ea44fd760365 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:14:23 +0200 Subject: [PATCH 019/160] Improve docstrings and add toc in autogenerated API reference --- Makefile | 5 + docs/conf.py | 9 +- mlblocks/datasets.py | 12 +- mlblocks/mlblock.py | 79 +++++++------ mlblocks/mlpipeline.py | 256 +++++++++++++++++++++++++++-------------- mlblocks/primitives.py | 3 +- setup.cfg | 6 + setup.py | 4 + 8 files changed, 234 insertions(+), 140 deletions(-) diff --git a/Makefile b/Makefile index c2d2aaa4..6266033f 100644 --- a/Makefile +++ b/Makefile @@ -98,6 +98,11 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort autopep8 --in-place --recursive --aggressive tests isort --apply --atomic --recursive tests +.PHONY: lint-docs +lint-docs: ## check docs formatting with doc8 and pydocstyle + doc8 mlblocks/ + pydocstyle mlblocks/ + # TEST TARGETS diff --git a/docs/conf.py b/docs/conf.py index 9b4595ec..95653914 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,8 +39,13 @@ 'sphinx.ext.graphviz', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', + 'autodocsumm', ] +autodoc_default_options = { + 'autosummary': True, +} + ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] # Add any paths that contain templates here, relative to this directory. @@ -50,10 +55,6 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md', '.ipynb'] -# source_parsers = { -# '.md': CommonMarkParser, -# } - # The master toctree document. master_doc = 'index' diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index fb32df9c..0c69afda 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -100,6 +100,7 @@ class Dataset(): **kwargs: Any additional keyword argument passed on initialization will be made available as instance attributes. """ + def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs): self.name = description.splitlines()[0] @@ -115,10 +116,10 @@ def __init__(self, description, data, target, score, shuffle=True, stratify=Fals self.__dict__.update(kwargs) def score(self, *args, **kwargs): - """Scoring function for this dataset. + r"""Scoring function for this dataset. Args: - \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be + \*args, \*\*kwargs: Any given arguments and keyword arguments will be directly passed to the given scoring function. Returns: @@ -315,7 +316,6 @@ def load_dic28(): There exist 52,652 words (vertices in a network) having 2 up to 8 characters in the dictionary. The obtained network has 89038 edges. 
""" - dataset_path = _load('dic28') X = _load_csv(dataset_path, 'data') @@ -344,7 +344,6 @@ def load_nomination(): Data consists of one graph whose nodes contain two attributes, attr1 and attr2. Associated with each node is a label that has to be learned and predicted. """ - dataset_path = _load('nomination') X = _load_csv(dataset_path, 'data') @@ -363,7 +362,6 @@ def load_amazon(): co-purchased with product j, the graph contains an undirected edge from i to j. Each product category provided by Amazon defines each ground-truth community. """ - dataset_path = _load('amazon') X = _load_csv(dataset_path, 'data') @@ -383,7 +381,6 @@ def load_jester(): source: "University of California Berkeley, CA" sourceURI: "http://eigentaste.berkeley.edu/dataset/" """ - dataset_path = _load('jester') X = _load_csv(dataset_path, 'data') @@ -393,7 +390,7 @@ def load_jester(): def load_wikiqa(): - """A Challenge Dataset for Open-Domain Question Answering. + """Challenge Dataset for Open-Domain Question Answering. WikiQA dataset is a publicly available set of question and sentence (QS) pairs, collected and annotated for research on open-domain question answering. @@ -401,7 +398,6 @@ def load_wikiqa(): source: "Microsoft" sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#" """ # noqa - dataset_path = _load('wikiqa') data = _load_csv(dataset_path, 'data', set_index=True) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index a5cdb6a4..c3878e68 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -25,32 +25,34 @@ class MLBlock(): as wrapping them and providing a common interface to run them. Attributes: - name (str): Name given to this MLBlock. - primitive (object): the actual function or instance which this MLBlock - wraps. - fit_args (dict): specification of the arguments expected by the `fit` - method. - fit_method (str): name of the primitive method to call on `fit`. - `None` if the primitive is a function. - produce_args (dict): specification of the arguments expected by the - `predict` method. - produce_output (dict): specification of the outputs of the `produce` - method. - produce_method (str): name of the primitive method to call on - `produce`. `None` if the primitive is a function. + name (str): + Name given to this MLBlock. + primitive (object): + the actual function or instance which this MLBlock wraps. + fit_args (dict): + specification of the arguments expected by the `fit` method. + fit_method (str): + name of the primitive method to call on `fit`. `None` if the primitive is a function. + produce_args (dict): + specification of the arguments expected by the `predict` method. + produce_output (dict): + specification of the outputs of the `produce` method. + produce_method (str): + name of the primitive method to call on `produce`. `None` if the primitive is a + function. Args: - name (str): Name given to this MLBlock. - **kwargs: Any additional arguments that will be used as - hyperparameters or passed to the `fit` or `produce` - methods. + name (str): + Name given to this MLBlock. + **kwargs: + Any additional arguments that will be used as hyperparameters or passed to the + `fit` or `produce` methods. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found within the `kwargs` or if an unexpected - argument has been given. 
- """ - # pylint: disable=too-many-instance-attributes + TypeError: + A `TypeError` is raised if a required argument is not found within the `kwargs` + or if an unexpected argument has been given. + """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. @@ -63,16 +65,16 @@ def _extract_params(self, kwargs, hyperparameters): have been given and that nothing unexpected exists in the input. Args: - kwargs (dict): dict containing the Keyword arguments that have - been passed to the `__init__` method upon - initialization. - hyperparameters (dict): hyperparameters dictionary, as found in - the JSON annotation. + kwargs (dict): + dict containing the Keyword arguments that have been passed to the `__init__` + method upon initialization. + hyperparameters (dict): + hyperparameters dictionary, as found in the JSON annotation. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found in the `kwargs` dict, or if an unexpected - argument has been given. + TypeError: + A `TypeError` is raised if a required argument is not found in the `kwargs` dict, + or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -138,7 +140,6 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable def __init__(self, name, **kwargs): - self.name = name metadata = load_primitive(name) @@ -174,6 +175,7 @@ def __init__(self, name, **kwargs): self.set_hyperparameters(default) def __str__(self): + """Return a string that represents this block.""" return 'MLBlock - {}'.format(self.name) def get_tunable_hyperparameters(self): @@ -210,9 +212,9 @@ def set_hyperparameters(self, hyperparameters): If necessary, a new instance of the primitive is created. Args: - hyperparameters (dict): Dictionary containing as keys the name - of the hyperparameters and as values - the values to be used. + hyperparameters (dict): + Dictionary containing as keys the name of the hyperparameters and as + values the values to be used. """ self._hyperparameters.update(hyperparameters) @@ -233,12 +235,13 @@ def fit(self, **kwargs): the primitive is a simple function, this will be a noop. Args: - **kwargs: Any given keyword argument will be directly passed - to the primitive fit method. + **kwargs: + Any given keyword argument will be directly passed to the primitive fit method. Raises: - TypeError: A `TypeError` might be raised if any argument not - expected by the primitive fit method is given. + TypeError: + A `TypeError` might be raised if any argument not expected by the primitive fit + method is given. """ if self.fit_method is not None: fit_args = self._fit_params.copy() diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 91e44341..eddb442e 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -34,38 +34,35 @@ class MLPipeline(): results, which will be returned as the prediction of the pipeline. Attributes: - primitives (list): List of the names of the primitives that compose - this pipeline. - blocks (list): OrderedDict of the block names and the corresponding - MLBlock instances. - init_params (dict): init_params dictionary, as given when the instance - was created. - input_names (dict): input_names dictionary, as given when the instance - was created. - output_names (dict): output_names dictionary, as given when the instance - was created. + primitives (list): + List of the names of the primitives that compose this pipeline. 
+        blocks (list):
+            OrderedDict of the block names and the corresponding MLBlock instances.
+        init_params (dict):
+            init_params dictionary, as given when the instance was created.
+        input_names (dict):
+            input_names dictionary, as given when the instance was created.
+        output_names (dict):
+            output_names dictionary, as given when the instance was created.
 
     Args:
-        primitives (list): List with the names of the primitives that will
-            compose this pipeline.
-        init_params (dict): dictionary containing initialization arguments to
-            be passed when creating the MLBlocks instances.
-            The dictionary keys must be the corresponding
-            primitive names and the values must be another
-            dictionary that will be passed as `**kargs` to the
-            MLBlock instance.
-        input_names (dict): dictionary that maps input variable names with the
-            actual names expected by each primitive. This
-            allows reusing the same input argument for multiple
-            primitives that name it differently, as well as
-            passing different values to primitives that expect
-            arguments named similary.
-        output_names (dict): dictionary that maps output variable names with
-            the name these variables will be given when stored
-            in the context dictionary. This allows storing
-            the output of different primitives in different
-            variables, even if the primitive output name is
-            the same one.
+        primitives (list):
+            List with the names of the primitives that will compose this pipeline.
+        init_params (dict):
+            dictionary containing initialization arguments to be passed when creating the
+            MLBlocks instances. The dictionary keys must be the corresponding primitive names
+            and the values must be another dictionary that will be passed as `**kwargs` to the
+            MLBlock instance.
+        input_names (dict):
+            dictionary that maps input variable names with the actual names expected by each
+            primitive. This allows reusing the same input argument for multiple primitives that
+            name it differently, as well as passing different values to primitives that expect
+            arguments named similarly.
+        output_names (dict):
+            dictionary that maps output variable names with the name these variables will be
+            given when stored in the context dictionary. This allows storing the output of
+            different primitives in different variables, even if the primitive output name is
+            the same one.
     """
 
     def _get_tunable_hyperparameters(self):
@@ -133,9 +130,9 @@ def set_hyperparameters(self, hyperparameters):
         """Set new hyperparameter values for some blocks.
 
         Args:
-            hyperparameters (dict): A dictionary containing the block names as
-                keys and the new hyperparameters dictionary
-                as values.
+            hyperparameters (dict):
+                A dictionary containing the block names as keys and the new hyperparameters
+                dictionary as values.
         """
         for block_name, block_hyperparams in hyperparameters.items():
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
@@ -148,11 +145,12 @@ def _get_block_args(self, block_name, block_args, context):
         was created.
 
         Args:
-            block_name (str): Name of this block. Used to find the corresponding
-                input_names.
-            block_args (list): list of method argument specifications from the
-                primitive.
-            context (dict): current context dictionary.
+            block_name (str):
+                Name of this block. Used to find the corresponding input_names.
+            block_args (list):
+                list of method argument specifications from the primitive.
+            context (dict):
+                current context dictionary.
         Returns:
             dict:
@@ -213,22 +211,40 @@ def _get_block_name(self, index):
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
-        """Parsre the output specification and get a block name and a variable name.
+        """Parse the output specification and get a block name and a variable name.
 
         The output specification can be of two types: int and str.
 
         If it is an integer, it is interpreted as a block index, and the variable name
         is considered to be ``None``, which means that the whole context will be returned.
 
-        If it is a string, it is interpreted as the block name, and it has to match a block
-        name exactly, including its hash and counter number ``#n``. Optionally, a variable
-        name can be passed at the end using a ``'.'`` as a separator.
-        In this case, the format of the string is `{block_name}.{variable_name}`. Note
-        that the block name can also contain dots, so only the leftmost dot will be
-        considered, and only if the complete string does not match exactly a block name.
+        If it is a string, it can be interpreted in three ways:
+
+        * **block name**: If the string matches a block name exactly, including
+          its hash and counter number ``#n`` at the end, the whole context will be
+          returned after that block is produced.
+        * **variable_name**: If the string does not match any block name and does
+          not contain any dot character, ``'.'``, it will be considered a variable
+          name. In this case, the indicated variable will be extracted from the
+          context and returned after the last block has been produced.
+        * **block_name + variable_name**: If the complete string does not match a
+          block name but it contains at least one dot, ``'.'``, it will be split
+          in two parts on the last dot. If the first part of the string matches a
+          block name exactly, the second part of the string will be considered a
+          variable name, assuming the format ``{block_name}.{variable_name}``, and
+          the indicated variable will be extracted from the context and returned
+          after the block has been produced. Otherwise, if the extracted
+          ``block_name`` does not match a block name exactly, a ``ValueError``
+          will be raised.
 
         Args:
-            output (str or int): Output specification as either a string or an integer.
+            output (str or int):
+                Output specification as either a string or an integer.
+
+        Raises:
+            ValueError:
+                If the output string contains dots but it does not match a block
+                name exactly.
 
         Returns:
             tuple:
@@ -239,15 +255,21 @@ def _get_output_spec(self, output):
                   It can be ``None``, which means that the whole context is to be
                   returned.
         """
+        # If None is given, both block and variable are None
         if output is None:
             return None, None
 
+        # If an int is given, it is a block index and there is no variable
         if isinstance(output, int):
            output = self._get_block_name(output)
+
             return output, None
 
+        # If the string matches a block name, there is no variable
         if output in self.blocks:
             return output, None
 
+        # If there is at least one dot in the output, but it did not match
+        # a block name, it is considered to be {block_name}.{variable_name}
         if '.' 
in output: output_block, output_variable = output.rsplit('.', 1) if output_block not in self.blocks: @@ -255,6 +277,9 @@ def _get_output_spec(self, output): return output_block, output_variable + # If the given string is not a block name and it has no dots, + # it is considered to be a variable name to be extracted + # from the context after the last block has been produced last_block_name = self._get_block_name(-1) return last_block_name, output @@ -285,25 +310,48 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): `produce` calls will be taken. Args: - X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to - behave. - output_ (str or int): Output specification, which can be a string or an integer. - If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - If a string is given, it is expected to be the name of one block, including - its counter number at the end. Optionally, a variable name can be included - at the end after the counter number using a ``'.'`` as a separator between the - block name and the variable name. If the variable name is given, this will be - extracted from the context and returned. Otherwise, the whole context will - be returned. - start_ (str or int): Block index or block name to start processing from. The + X: + Fit Data, which the pipeline will learn from. + + y: + Fit Data labels, which the pipeline will use to learn how to + behave. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + + * If it is None (default), nothing will be returned + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot characted, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. 
+ + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. Returns: None or dict or object: @@ -328,11 +376,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_: - start_ = False - elif start_: - LOGGER.debug("Skipping block %s fit", block_name) - continue + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s fit", block_name) + continue LOGGER.debug("Fitting block %s", block_name) try: @@ -357,7 +406,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if block_name == output_block: return self._get_output(output_variable, context) - def predict(self, X=None, output_='y', start_=None, **kwargs): + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) + + def predict(self, X=None, output_=None, start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -369,23 +422,43 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): will be taken. Args: - X: Data which the pipeline will use to make predictions. - output_ (str or int): Output specification, which can be a string or an integer. - If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - If a string is given, it is expected to be the name of one block, including - its counter number at the end. Optionally, a variable name can be included - at the end after the counter number using a ``'.'`` as a separator between the - block name and the variable name. If the variable name is given, this will be - extracted from the context and returned. Otherwise, the whole context will - be returned. - start_ (str or int): Block index or block name to start processing from. The + X: + Data which the pipeline will use to make predictions. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + * If it is None (default), the output of the last block will be returned. + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot characted, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. 
Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. Returns: None or dict or object: @@ -408,11 +481,12 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): start_ = self._get_block_name(start_) for block_name, block in self.blocks.items(): - if block_name == start_: - start_ = False - elif start_: - LOGGER.debug("Skipping block %s produce", block_name) - continue + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s produce", block_name) + continue LOGGER.debug("Producing block %s", block_name) try: @@ -432,6 +506,9 @@ def predict(self, X=None, output_='y', start_=None, **kwargs): # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) + if output_ is None: + return outputs + def to_dict(self): """Return all the details of this MLPipeline in a dict. @@ -487,7 +564,8 @@ def save(self, path): The content of the JSON file is the dict returned by the `to_dict` method. Args: - path (str): Path to the JSON file to write. + path (str): + Path to the JSON file to write. """ with open(path, 'w') as out_file: json.dump(self.to_dict(), out_file, indent=4) @@ -499,7 +577,8 @@ def from_dict(cls, metadata): The dict structure is the same as the one created by the `to_dict` method. Args: - metadata (dict): Dictionary containing the pipeline specification. + metadata (dict): + Dictionary containing the pipeline specification. Returns: MLPipeline: @@ -531,7 +610,8 @@ def load(cls, path): The JSON file format is the same as the one created by the `to_dict` method. Args: - path (str): Path of the JSON file to load. + path (str): + Path of the JSON file to load. Returns: MLPipeline: diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 9bca6a5d..f2300f67 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -37,6 +37,7 @@ def add_primitives_path(path): Raises: ValueError: A `ValueError` will be raised if the path is not valid. + """ if path not in _PRIMITIVES_PATHS: if not os.path.isdir(path): @@ -68,7 +69,6 @@ def get_primitives_paths(): list: The list of folders. """ - primitives_paths = list() entry_points = pkg_resources.iter_entry_points('mlprimitives') for entry_point in entry_points: @@ -99,7 +99,6 @@ def load_primitive(name): ValueError: A `ValueError` will be raised if the primitive cannot be found. 
""" - for base_path in get_primitives_paths(): parts = name.split('.') number_of_parts = len(parts) diff --git a/setup.cfg b/setup.cfg index 62ced521..17244565 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,3 +45,9 @@ collect_ignore = ['setup.py'] [tool:pylint] good-names = X,y + +[doc8] +max-line-length = 99 + +[pydocstyle] +add-ignore = D403,D413,D105,D107 diff --git a/setup.py b/setup.py index f6991ab1..c73eb0a6 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,10 @@ # Advanced testing 'tox>=2.9.1', 'coverage>=4.5.1', + + # Documentation style + 'doc8==0.8.0', + 'pydocstyle==3.0.0' ] From 711201650e50e7ef0c3861347ac89abfa1a5c77d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:42:10 +0200 Subject: [PATCH 020/160] Add missing dependency --- setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c73eb0a6..f355be93 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,10 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz==0.9', - 'ipython==6.5.0', - 'matplotlib==2.2.3', + 'graphviz>=0.9', + 'ipython>=6.5.0', + 'matplotlib>=2.2.3', + 'autodocsumm>=0.1.10', # style check 'flake8>=3.5.0', @@ -61,8 +62,8 @@ 'coverage>=4.5.1', # Documentation style - 'doc8==0.8.0', - 'pydocstyle==3.0.0' + 'doc8>=0.8.0', + 'pydocstyle>=3.0.0' ] From b26e527117cc45f94ed87c558e528a9a3276ff6f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 16 May 2019 19:50:47 +0200 Subject: [PATCH 021/160] Move default and keyword arguments logic to MLBlock --- mlblocks/mlblock.py | 55 ++++++++++++++++++++++++++++++++++++------ mlblocks/mlpipeline.py | 14 +---------- setup.py | 3 ++- 3 files changed, 50 insertions(+), 22 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index c3878e68..80f5baa2 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -222,6 +222,43 @@ def set_hyperparameters(self, hyperparameters): LOGGER.debug('Creating a new primitive instance for %s', self.name) self.instance = self.primitive(**self._hyperparameters) + def _get_method_kwargs(self, kwargs, method_args): + """Prepare the kwargs for the method. + + The kwargs dict will be altered according to the method_kwargs + specification to make them ready for the primitive method to + accept them. + + Args: + kwargs (dict): + keyword arguments that have been passed to the block method. + method_args (list): + method arguments as specified in the JSON annotation. + + Returns: + dict: + A dictionary containing the argument names and values to pass + to the primitive method. + """ + + method_kwargs = dict() + for arg in method_args: + name = arg['name'] + keyword = arg.get('keyword', name) + + if name in kwargs: + value = kwargs[name] + + elif 'default' in arg: + value = arg['default'] + + else: + raise TypeError("missing expected argument '{}'".format(name)) + + method_kwargs[keyword] = value + + return method_kwargs + def fit(self, **kwargs): """Call the fit method of the primitive. @@ -244,9 +281,10 @@ def fit(self, **kwargs): method is given. """ if self.fit_method is not None: - fit_args = self._fit_params.copy() - fit_args.update(kwargs) - getattr(self.instance, self.fit_method)(**fit_args) + fit_kwargs = self._fit_params.copy() + fit_kwargs.update(kwargs) + fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args) + getattr(self.instance, self.fit_method)(**fit_kwargs) def produce(self, **kwargs): """Call the primitive function, or the predict method of the primitive. 
@@ -262,10 +300,11 @@ def produce(self, **kwargs): The output of the call to the primitive function or primitive produce method. """ - produce_args = self._produce_params.copy() - produce_args.update(kwargs) + produce_kwargs = self._produce_params.copy() + produce_kwargs.update(kwargs) + produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args) if self._class: - return getattr(self.instance, self.produce_method)(**produce_args) + return getattr(self.instance, self.produce_method)(**produce_kwargs) - produce_args.update(self._hyperparameters) - return self.primitive(**produce_args) + produce_kwargs.update(self._hyperparameters) + return self.primitive(**produce_kwargs) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index eddb442e..9a0a109e 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -164,22 +164,10 @@ def _get_block_args(self, block_name, block_args, context): kwargs = dict() for arg in block_args: name = arg['name'] - keyword = arg.get('keyword', name) variable = input_names.get(name, name) if variable in context: - value = context[variable] - - elif 'default' in arg: - value = arg['default'] - - else: - raise TypeError( - "Expected argument '{}.{}' not found in context" - .format(block_name, variable) - ) - - kwargs[keyword] = value + kwargs[name] = context[variable] return kwargs diff --git a/setup.py b/setup.py index f355be93..9fca4dfa 100644 --- a/setup.py +++ b/setup.py @@ -15,13 +15,14 @@ install_requires = [ - 'mlprimitives>=0.1.3', ] tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', + 'mlprimitives>=0.1.3,<0.2', + 'urllib3>=1.20,<1.25' ] From 00f11647ab11456f5e2d6761cd36170796ac5250 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 21 May 2019 12:16:33 -0400 Subject: [PATCH 022/160] Load pipelines by name --- mlblocks/__init__.py | 8 +- mlblocks/discovery.py | 263 ++++++++++++++++++ mlblocks/mlblock.py | 2 +- mlblocks/mlpipeline.py | 70 ++++- mlblocks/primitives.py | 116 -------- tests/features/test_pipeline_loading.py | 106 +++++++ .../{test_primitives.py => test_discovery.py} | 40 +-- tests/test_mlpipeline.py | 6 +- 8 files changed, 460 insertions(+), 151 deletions(-) create mode 100644 mlblocks/discovery.py delete mode 100644 mlblocks/primitives.py create mode 100644 tests/features/test_pipeline_loading.py rename tests/{test_primitives.py => test_discovery.py} (60%) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index cf326495..37199013 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -10,9 +10,11 @@ * Documentation: https://HDI-Project.github.io/MLBlocks """ +from mlblocks.discovery import ( + add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths, + load_pipeline, load_primitive) from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline -from mlblocks.primitives import add_primitives_path, get_primitives_paths, load_primitive __author__ = 'MIT Data To AI Lab' __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' @@ -21,6 +23,6 @@ __version__ = '0.3.1-dev' __all__ = [ - 'MLBlock', 'MLPipeline', 'add_primitives_path', - 'get_primitives_paths', 'load_primitive' + 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', + 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive' ] diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py new file mode 100644 index 00000000..78f12021 --- /dev/null +++ b/mlblocks/discovery.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- + +""" +Primitives and Pipelines 
discovery module. + +This module contains functions to load primitive and pipeline +annotations, as well as to configure how MLBlocks finds the +primitives and pipelines. +""" + +import json +import logging +import os +import sys + +import pkg_resources + +LOGGER = logging.getLogger(__name__) + +_PRIMITIVES_PATHS = [ + os.path.join(os.getcwd(), 'mlprimitives'), + os.path.join(sys.prefix, 'mlprimitives'), + os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy + os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy +] +_PIPELINES_PATHS = [ + os.path.join(os.getcwd(), 'mlpipelines'), +] + + +def _add_lookup_path(path, paths): + """Add a new path to lookup. + + The new path will be inserted in the first place of the list, + so any element found in this new folder will take precedence + over any other element with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + + """ + if path not in paths: + if not os.path.isdir(path): + raise ValueError('Invalid path: {}'.format(path)) + + paths.insert(0, os.path.abspath(path)) + return True + + +def add_primitives_path(path): + """Add a new path to look for primitives. + + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other primitive with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PRIMITIVES_PATHS) + if added: + LOGGER.debug('New primitives path added: %s', path) + + +def add_pipelines_path(path): + """Add a new path to look for pipelines. + + The new path will be inserted in the first place of the list, + so any primitive found in this new folder will take precedence + over any other pipeline with the same name that existed in the + system before. + + Args: + path (str): + path to add + + Raises: + ValueError: + A `ValueError` will be raised if the path is not valid. + """ + added = _add_lookup_path(path, _PIPELINES_PATHS) + if added: + LOGGER.debug('New pipelines path added: %s', path) + + +def _get_lookup_paths(entry_point): + """Get the list of folders where elements will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the entry_point name. + + An example of such an entry point would be:: + + entry_points = { + 'mlprimitives': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Args: + entry_point: + The name of the `entry_point` to look for. + + Returns: + list: + The list of folders. + """ + lookup_paths = list() + entry_points = pkg_resources.iter_entry_points(entry_point) + for entry_point in entry_points: + if entry_point.name == 'jsons_path': + path = entry_point.load() + lookup_paths.append(path) + + return lookup_paths + + +def get_primitives_paths(): + """Get the list of folders where primitives will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the `mlprimitives` name. 
+ + An example of such an entry point would be:: + + entry_points = { + 'mlprimitives': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. + """ + return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives') + + +def get_pipelines_paths(): + """Get the list of folders where pipelines will be looked for. + + This list will include the value of any `entry_point` named `jsons_path` published under + the `mlpipelines` name. + + An example of such an entry point would be:: + + entry_points = { + 'mlpipelines': [ + 'jsons_path=some_module:SOME_VARIABLE' + ] + } + + where the module `some_module` contains a variable such as:: + + SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + + Returns: + list: + The list of folders. + """ + return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines') + + +def _load(name, paths): + """Locate and load the JSON annotation in any of the given paths. + + All the given paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + """ + for base_path in paths: + parts = name.split('.') + number_of_parts = len(parts) + + for folder_parts in range(number_of_parts): + folder = os.path.join(base_path, *parts[:folder_parts]) + filename = '.'.join(parts[folder_parts:]) + '.json' + json_path = os.path.join(folder, filename) + + if os.path.isfile(json_path): + with open(json_path, 'r') as json_file: + LOGGER.debug('Loading %s from %s', name, json_path) + return json.load(json_file) + + +def load_primitive(name): + """Locate and load the primitive JSON annotation. + + All the primitive paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A `ValueError` will be raised if the primitive cannot be found. + """ + primitive = _load(name, get_primitives_paths()) + if not primitive: + raise ValueError("Unknown primitive: {}".format(name)) + + return primitive + + +def load_pipeline(name): + """Locate and load the pipeline JSON annotation. + + All the pipeline paths will be scanned to find a JSON file with the given name, + and as soon as a JSON with the given name is found it is returned. + + Args: + name (str): + name of the JSON to look for. The name should not contain the + `.json` extension, as it will be added dynamically. + + Returns: + dict: + The content of the JSON annotation file loaded into a dict. + + Raises: + ValueError: + A `ValueError` will be raised if the pipeline cannot be found. 
+ """ + pipeline = _load(name, get_pipelines_paths()) + if not pipeline: + raise ValueError("Unknown pipeline: {}".format(name)) + + return pipeline diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 80f5baa2..1ab4a557 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -5,7 +5,7 @@ import importlib import logging -from mlblocks.primitives import load_primitive +from mlblocks.discovery import load_primitive LOGGER = logging.getLogger(__name__) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 9a0a109e..dc12b41f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -6,6 +6,7 @@ import logging from collections import Counter, OrderedDict +from mlblocks.discovery import load_pipeline from mlblocks.mlblock import MLBlock LOGGER = logging.getLogger(__name__) @@ -46,6 +47,12 @@ class MLPipeline(): output_names dictionary, as given when the instance was created. Args: + pipeline (str, list, dict or MLPipeline): + The pipeline argument accepts four different types with different interpretations: + * `str`: the name of the pipeline to search and load. + * `list`: the primitives list. + * `dict`: a complete pipeline specification. + * `MLPipeline`: another pipeline to be cloned. primitives (list): List with the names of the primitives that will compose this pipeline. init_params (dict): @@ -73,10 +80,9 @@ def _get_tunable_hyperparameters(self): return tunable - def __init__(self, primitives, init_params=None, input_names=None, output_names=None): - self.primitives = primitives - self.init_params = init_params or dict() - self.blocks = OrderedDict() + @staticmethod + def _build_blocks(primitives, init_params): + blocks = OrderedDict() block_names_count = Counter() for primitive in primitives: @@ -84,23 +90,67 @@ def __init__(self, primitives, init_params=None, input_names=None, output_names= block_names_count.update([primitive]) block_count = block_names_count[primitive] block_name = '{}#{}'.format(primitive, block_count) - block_params = self.init_params.get(block_name, dict()) + block_params = init_params.get(block_name, dict()) if not block_params: - block_params = self.init_params.get(primitive, dict()) + block_params = init_params.get(primitive, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " "for more than one block %s."), primitive) block = MLBlock(primitive, **block_params) - self.blocks[block_name] = block + blocks[block_name] = block except Exception: LOGGER.exception("Exception caught building MLBlock %s", primitive) raise - self.input_names = input_names or dict() - self.output_names = output_names or dict() - self._tunable_hyperparameters = self._get_tunable_hyperparameters() + return blocks + + @staticmethod + def _get_pipeline_dict(pipeline, primitives): + + if isinstance(pipeline, dict): + return pipeline + + elif isinstance(pipeline, str): + return load_pipeline(pipeline) + + elif isinstance(pipeline, MLPipeline): + return pipeline.to_dict() + + elif isinstance(pipeline, list): + if primitives is not None: + raise ValueError('if `pipeline` is a `list`, `primitives` must be `None`') + + return {'primitives': pipeline} + + elif pipeline is None: + if primitives is None: + raise ValueError('Either `pipeline` or `primitives` must be not `None`.') + + return dict() + + def __init__(self, pipeline=None, primitives=None, init_params=None, + input_names=None, output_names=None): + + pipeline = self._get_pipeline_dict(pipeline, primitives) + + self.primitives = primitives or 
pipeline['primitives'] + self.init_params = init_params or pipeline.get('init_params', dict()) + self.blocks = self._build_blocks(self.primitives, self.init_params) + + self.input_names = input_names or pipeline.get('input_names', dict()) + self.output_names = output_names or pipeline.get('output_names', dict()) + + tunable = pipeline.get('tunable_hyperparameters') + if tunable is not None: + self._tunable_hyperparameters = tunable + else: + self._tunable_hyperparameters = self._get_tunable_hyperparameters() + + hyperparameters = pipeline.get('hyperparameters') + if hyperparameters: + self.set_hyperparameters(hyperparameters) def get_tunable_hyperparameters(self): """Get the tunable hyperparamters of each block. diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py deleted file mode 100644 index f2300f67..00000000 --- a/mlblocks/primitives.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Primitives module. - -This module contains functions to load primitive annotations, -as well as to configure how MLBlocks finds the primitives. -""" - -import json -import logging -import os -import sys - -import pkg_resources - -LOGGER = logging.getLogger(__name__) - -_PRIMITIVES_PATHS = [ - os.path.join(os.getcwd(), 'mlprimitives'), - os.path.join(sys.prefix, 'mlprimitives'), - os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy - os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy -] - - -def add_primitives_path(path): - """Add a new path to look for primitives. - - The new path will be inserted in the first place of the list, - so any primitive found in this new folder will take precedence - over any other primitive with the same name that existed in the - system before. - - Args: - path (str): path to add - - Raises: - ValueError: A `ValueError` will be raised if the path is not valid. - - """ - if path not in _PRIMITIVES_PATHS: - if not os.path.isdir(path): - raise ValueError('Invalid path: {}'.format(path)) - - LOGGER.debug('Adding new primitives path %s', path) - _PRIMITIVES_PATHS.insert(0, os.path.abspath(path)) - - -def get_primitives_paths(): - """Get the list of folders where the primitives will be looked for. - - This list will include the value of any `entry_point` named `jsons_path` published under - the name `mlprimitives`. - - An example of such an entry point would be:: - - entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' - ] - } - - where the module `some_module` contains a variable such as:: - - SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') - - Returns: - list: - The list of folders. - """ - primitives_paths = list() - entry_points = pkg_resources.iter_entry_points('mlprimitives') - for entry_point in entry_points: - if entry_point.name == 'jsons_path': - path = entry_point.load() - primitives_paths.append(path) - - return _PRIMITIVES_PATHS + primitives_paths - - -def load_primitive(name): - """Locate and load the JSON annotation of the given primitive. - - All the paths found in PRIMTIVE_PATHS will be scanned to find a JSON file - with the given name, and as soon as a JSON with the given name is found it - is returned. - - Args: - name (str): name of the primitive to look for. The name should - correspond to the primitive, not to the filename, as the - `.json` extension will be added dynamically. - - Returns: - dict: - The content of the JSON annotation file loaded into a dict. - - Raises: - ValueError: A `ValueError` will be raised if the primitive cannot be - found. 
- """ - for base_path in get_primitives_paths(): - parts = name.split('.') - number_of_parts = len(parts) - - for folder_parts in range(number_of_parts): - folder = os.path.join(base_path, *parts[:folder_parts]) - filename = '.'.join(parts[folder_parts:]) + '.json' - json_path = os.path.join(folder, filename) - - if os.path.isfile(json_path): - with open(json_path, 'r') as json_file: - LOGGER.debug('Loading primitive %s from %s', name, json_path) - return json.load(json_file) - - raise ValueError("Unknown primitive: {}".format(name)) diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py new file mode 100644 index 00000000..bc344d63 --- /dev/null +++ b/tests/features/test_pipeline_loading.py @@ -0,0 +1,106 @@ +from unittest import TestCase +from unittest.mock import Mock + +from mlblocks import MLPipeline + + +class TestMLPipeline(TestCase): + + def test_dict(self): + pipeline_dict = { + 'primitives': [ + 'sklearn.ensemble.RandomForestClassifier' + ], + 'init_params': { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + }, + 'input_names': { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + }, + 'output_names': { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + } + + pipeline = MLPipeline(pipeline_dict) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + assert pipeline.input_names == { + 'sklearn.ensemble.RandomForest#1': { + 'X': 'X1' + } + } + assert pipeline.output_names == { + 'sklearn.ensemble.RandomForest#1': { + 'y': 'y1' + } + } + + def test_list(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_none(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + + assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + def test_mlpipeline(self): + primitives = [ + 'sklearn.ensemble.RandomForestClassifier' + ] + init_params = { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } + + pipeline = MLPipeline(primitives=primitives, init_params=init_params) + pipeline2 = MLPipeline(pipeline) + + assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier'] + assert pipeline2.init_params == { + 'sklearn.ensemble.RandomForest#1': { + 'n_estimators': 500 + } + } diff --git a/tests/test_primitives.py b/tests/test_discovery.py similarity index 60% rename from tests/test_primitives.py rename to tests/test_discovery.py index 1afd17b6..3a7c3321 100644 --- a/tests/test_primitives.py +++ b/tests/test_discovery.py @@ -9,57 +9,57 @@ import pytest from pkg_resources import Distribution, EntryPoint -from mlblocks import primitives +from mlblocks import discovery FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake' -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', 
new=['a', 'b']) def test_add_primitives_path_do_nothing(): - primitives.add_primitives_path('a') + discovery.add_primitives_path('a') - assert primitives._PRIMITIVES_PATHS == ['a', 'b'] + assert discovery._PRIMITIVES_PATHS == ['a', 'b'] -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path_exception(): invalid_path = str(uuid.uuid4()) with pytest.raises(ValueError): - primitives.add_primitives_path(invalid_path) + discovery.add_primitives_path(invalid_path) -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path(): - primitives.add_primitives_path('tests') + discovery.add_primitives_path('tests') expected_path = os.path.abspath('tests') - assert primitives._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] + assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') def test_get_primitives_paths_no_entry_points(iep_mock): # setup iep_mock.return_value == [] # run - paths = primitives.get_primitives_paths() + paths = discovery.get_primitives_paths() # assert assert paths == ['a', 'b'] iep_mock.assert_called_once_with('mlprimitives') -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) -@patch('mlblocks.primitives.pkg_resources.iter_entry_points') +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery.pkg_resources.iter_entry_points') def test_get_primitives_paths_entry_points(iep_mock): # setup something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') jsons_path_ep = EntryPoint( 'jsons_path', - 'tests.test_primitives', + 'tests.test_discovery', attrs=['FAKE_MLPRIMITIVES_PATH'], dist=Distribution() ) @@ -69,7 +69,7 @@ def test_get_primitives_paths_entry_points(iep_mock): ] # run - paths = primitives.get_primitives_paths() + paths = discovery.get_primitives_paths() # assert expected = [ @@ -82,10 +82,10 @@ def test_get_primitives_paths_entry_points(iep_mock): iep_mock.assert_called_once_with('mlprimitives') -@patch('mlblocks.primitives._PRIMITIVES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_load_primitive_value_error(): with pytest.raises(ValueError): - primitives.load_primitive('invalid.primitive') + discovery.load_primitive('invalid.primitive') def test_load_primitive_success(): @@ -95,11 +95,11 @@ def test_load_primitive_success(): } with tempfile.TemporaryDirectory() as tempdir: - primitives.add_primitives_path(tempdir) + discovery.add_primitives_path(tempdir) primitive_path = os.path.join(tempdir, 'temp.primitive.json') with open(primitive_path, 'w') as primitive_file: json.dump(primitive, primitive_file, indent=4) - loaded = primitives.load_primitive('temp.primitive') + loaded = discovery.load_primitive('temp.primitive') assert primitive == loaded diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 2fa6d097..741be194 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -39,7 +39,11 @@ def test___init__(self, mlblock_mock, logger_mock): } expected_input_names = input_names.copy() - mlpipeline = MLPipeline(primitives, init_params, input_names) + mlpipeline = MLPipeline( + 
primitives=primitives, + init_params=init_params, + input_names=input_names + ) assert mlpipeline.primitives == expected_primitives assert mlpipeline.init_params == expected_init_params From eb36fcb12f79401c776b0269be35b7c64e1ea22d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 21 May 2019 13:54:02 -0400 Subject: [PATCH 023/160] Fix docs --- docs/advanced_usage/pipelines.rst | 2 +- docs/api/mlblocks.discovery.rst | 5 +++ docs/api/mlblocks.primitives.rst | 5 --- docs/getting_started/quickstart.rst | 2 +- docs/index.rst | 2 +- mlblocks/discovery.py | 34 +++++++-------- mlblocks/mlblock.py | 31 +++++++------- mlblocks/mlpipeline.py | 57 +++++++++++++------------ tests/features/test_pipeline_loading.py | 1 - 9 files changed, 70 insertions(+), 69 deletions(-) create mode 100644 docs/api/mlblocks.discovery.rst delete mode 100644 docs/api/mlblocks.primitives.rst diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst index cc7ccc49..33d57cdc 100644 --- a/docs/advanced_usage/pipelines.rst +++ b/docs/advanced_usage/pipelines.rst @@ -86,7 +86,7 @@ This can be done by passing an extra dictionary to the MLPipeline when it is cre 'n_estimators': 100 } } - pipeline = MLPipeline(primitives, init_params) + pipeline = MLPipeline(primitives, init_params=init_params) This dictionary must have as keys the name of the blocks that the arguments belong to, and as values the dictionary that contains the argument names and their values. diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst new file mode 100644 index 00000000..c9109130 --- /dev/null +++ b/docs/api/mlblocks.discovery.rst @@ -0,0 +1,5 @@ +mlblocks.discovery +================== + +.. automodule:: mlblocks.discovery + :members: diff --git a/docs/api/mlblocks.primitives.rst b/docs/api/mlblocks.primitives.rst deleted file mode 100644 index d625c774..00000000 --- a/docs/api/mlblocks.primitives.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.primitives -=================== - -.. automodule:: mlblocks.primitives - :members: diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2115fcef..c3edf475 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -38,7 +38,7 @@ Optionally, specific `hyperparameters`_ can be also set by specifying them in a 'n_estimators': 100 } } - pipeline = MLPipeline(primitives, hyperparameters) + pipeline = MLPipeline(primitives, init_params=hyperparameters) Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set for each block, by calling the `get_hyperparameters method`_. diff --git a/docs/index.rst b/docs/index.rst index 2bb4c5a9..c3655b3c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -74,7 +74,7 @@ integrate with deep learning libraries. api/mlblocks api/mlblocks.datasets - api/mlblocks.primitives + api/mlblocks.discovery .. toctree:: :caption: Resources diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 78f12021..1f952b81 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -42,7 +42,7 @@ def _add_lookup_path(path, paths): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. """ if path not in paths: @@ -67,7 +67,7 @@ def add_primitives_path(path): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. 
""" added = _add_lookup_path(path, _PRIMITIVES_PATHS) if added: @@ -88,7 +88,7 @@ def add_pipelines_path(path): Raises: ValueError: - A `ValueError` will be raised if the path is not valid. + A ``ValueError`` will be raised if the path is not valid. """ added = _add_lookup_path(path, _PIPELINES_PATHS) if added: @@ -98,7 +98,7 @@ def add_pipelines_path(path): def _get_lookup_paths(entry_point): """Get the list of folders where elements will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under + This list will include the value of any ``entry_point`` named ``jsons_path`` published under the entry_point name. An example of such an entry point would be:: @@ -109,13 +109,13 @@ def _get_lookup_paths(entry_point): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') Args: entry_point: - The name of the `entry_point` to look for. + The name of the ``entry_point`` to look for. Returns: list: @@ -134,8 +134,8 @@ def _get_lookup_paths(entry_point): def get_primitives_paths(): """Get the list of folders where primitives will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under - the `mlprimitives` name. + This list will include the value of any ``entry_point`` named ``jsons_path`` published under + the ``mlprimitives`` name. An example of such an entry point would be:: @@ -145,7 +145,7 @@ def get_primitives_paths(): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') @@ -159,8 +159,8 @@ def get_primitives_paths(): def get_pipelines_paths(): """Get the list of folders where pipelines will be looked for. - This list will include the value of any `entry_point` named `jsons_path` published under - the `mlpipelines` name. + This list will include the value of any ``entry_point`` named ``jsons_path`` published under + the ``mlpipelines`` name. An example of such an entry point would be:: @@ -170,7 +170,7 @@ def get_pipelines_paths(): ] } - where the module `some_module` contains a variable such as:: + where the module ``some_module`` contains a variable such as:: SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') @@ -190,7 +190,7 @@ def _load(name, paths): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. Returns: dict: @@ -220,7 +220,7 @@ def load_primitive(name): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. Returns: dict: @@ -228,7 +228,7 @@ def load_primitive(name): Raises: ValueError: - A `ValueError` will be raised if the primitive cannot be found. + A ``ValueError`` will be raised if the primitive cannot be found. """ primitive = _load(name, get_primitives_paths()) if not primitive: @@ -246,7 +246,7 @@ def load_pipeline(name): Args: name (str): name of the JSON to look for. The name should not contain the - `.json` extension, as it will be added dynamically. + ``.json`` extension, as it will be added dynamically. 
Returns: dict: @@ -254,7 +254,7 @@ def load_pipeline(name): Raises: ValueError: - A `ValueError` will be raised if the pipeline cannot be found. + A ``ValueError`` will be raised if the pipeline cannot be found. """ pipeline = _load(name, get_pipelines_paths()) if not pipeline: diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 1ab4a557..66bbf8fe 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -30,15 +30,16 @@ class MLBlock(): primitive (object): the actual function or instance which this MLBlock wraps. fit_args (dict): - specification of the arguments expected by the `fit` method. + specification of the arguments expected by the ``fit`` method. fit_method (str): - name of the primitive method to call on `fit`. `None` if the primitive is a function. + name of the primitive method to call on ``fit``. ``None`` if the + primitive is a function. produce_args (dict): - specification of the arguments expected by the `predict` method. + specification of the arguments expected by the ``predict`` method. produce_output (dict): - specification of the outputs of the `produce` method. + specification of the outputs of the ``produce`` method. produce_method (str): - name of the primitive method to call on `produce`. `None` if the primitive is a + name of the primitive method to call on ``produce``. ``None`` if the primitive is a function. Args: @@ -46,19 +47,19 @@ class MLBlock(): Name given to this MLBlock. **kwargs: Any additional arguments that will be used as hyperparameters or passed to the - `fit` or `produce` methods. + ``fit`` or ``produce`` methods. Raises: TypeError: - A `TypeError` is raised if a required argument is not found within the `kwargs` + A ``TypeError`` is raised if a required argument is not found within the ``kwargs`` or if an unexpected argument has been given. """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. - The `init_params`, `fit_params` and `produce_params` are extracted - from the passed `kwargs` taking the metadata hyperparameters as a + The ``init_params``, ``fit_params`` and ``produce_params`` are extracted + from the passed ``kwargs`` taking the metadata hyperparameters as a reference. During this extraction, make sure that all the required hyperparameters @@ -66,15 +67,15 @@ def _extract_params(self, kwargs, hyperparameters): Args: kwargs (dict): - dict containing the Keyword arguments that have been passed to the `__init__` + dict containing the Keyword arguments that have been passed to the ``__init__`` method upon initialization. hyperparameters (dict): hyperparameters dictionary, as found in the JSON annotation. Raises: TypeError: - A `TypeError` is raised if a required argument is not found in the `kwargs` dict, - or if an unexpected argument has been given. + A ``TypeError`` is raised if a required argument is not found in the + ``kwargs`` dict, or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -262,7 +263,7 @@ def _get_method_kwargs(self, kwargs, method_args): def fit(self, **kwargs): """Call the fit method of the primitive. - The given keyword arguments will be passed directly to the `fit` + The given keyword arguments will be passed directly to the ``fit`` method of the primitive instance specified in the JSON annotation. 
If any of the arguments expected by the produce method had been
@@ -277,7 +278,7 @@ def fit(self, **kwargs):
 
         Raises:
             TypeError:
-                A `TypeError` might be raised if any argument not expected by the primitive fit
+                A ``TypeError`` might be raised if any argument not expected by the primitive fit
                 method is given.
         """
         if self.fit_method is not None:
@@ -290,7 +291,7 @@ def produce(self, **kwargs):
         """Call the primitive function, or the predict method of the primitive.
 
         The given keyword arguments will be passed directly to the primitive,
-        if it is a simple function, or to the `produce` method of the
+        if it is a simple function, or to the ``produce`` method of the
         primitive instance specified in the JSON annotation, if it is a class.
 
         If any of the arguments expected by the fit method had been given
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dc12b41f..b73d96b9 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -58,7 +58,7 @@ class MLPipeline():
         init_params (dict):
             dictionary containing initialization arguments to be passed when creating the
             MLBlocks instances. The dictionary keys must be the corresponding primitive names
-            and the values must be another dictionary that will be passed as `**kargs` to the
+            and the values must be another dictionary that will be passed as ``**kwargs`` to the
             MLBlock instance.
         input_names (dict):
             dictionary that maps input variable names with the actual names expected by each
@@ -191,7 +191,7 @@ def _get_block_args(self, block_name, block_args, context):
         """Get the arguments expected by the block method from the context.
 
         The arguments will be taken from the context using both the method
-        arguments specification and the `input_names` given when the pipeline
+        arguments specification and the ``input_names`` given when the pipeline
         was created.
 
         Args:
@@ -245,7 +245,7 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
         return output_dict
 
     def _get_block_name(self, index):
-        """Get the name of the block in the `index` position."""
+        """Get the name of the block in the ``index`` position."""
        return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
@@ -338,14 +338,14 @@ def _get_output(self, output_variable, context):
     def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
         """Fit the blocks of this pipeline.
 
-        Sequentially call the `fit` and the `produce` methods of each block,
-        capturing the outputs each `produce` method before calling the `fit`
+        Sequentially call the ``fit`` and the ``produce`` methods of each block,
+        capturing the outputs of each ``produce`` method before calling the ``fit``
         method of the next one.
 
         During the whole process a context dictionary is built, where both the
-        passed arguments and the captured outputs of the `produce` methods
-        are stored, and from which the arguments for the next `fit` and
-        `produce` calls will be taken.
+        passed arguments and the captured outputs of the ``produce`` methods
+        are stored, and from which the arguments for the next ``fit`` and
+        ``produce`` calls will be taken.
 
         Args:
             X:
@@ -451,12 +451,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
     def predict(self, X=None, output_=None, start_=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
-        Sequentially call the `produce` method of each block, capturing the
+        Sequentially call the ``produce`` method of each block, capturing the
         outputs before calling the next one.
During the whole process a context dictionary is built, where both the
-        passed arguments and the captured outputs of the `produce` methods
-        are stored, and from which the arguments for the next `produce` calls
+        passed arguments and the captured outputs of the ``produce`` methods
+        are stored, and from which the arguments for the next ``produce`` calls
         will be taken.
 
         Args:
@@ -550,7 +550,7 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
     def to_dict(self):
         """Return all the details of this MLPipeline in a dict.
 
-        The dict structure contains all the `__init__` arguments of the
+        The dict structure contains all the ``__init__`` arguments of the
         MLPipeline, as well as the current hyperparameter values and the
         specification of the tunable_hyperparameters::
 
@@ -599,7 +599,7 @@ def to_dict(self):
     def save(self, path):
         """Save the specification of this MLPipeline in a JSON file.
 
-        The content of the JSON file is the dict returned by the `to_dict` method.
+        The content of the JSON file is the dict returned by the ``to_dict`` method.
 
         Args:
             path (str):
@@ -612,7 +612,7 @@ def save(self, path):
     def from_dict(cls, metadata):
         """Create a new MLPipeline from a dict specification.
 
-        The dict structure is the same as the one created by the `to_dict` method.
+        The dict structure is the same as the one created by the ``to_dict`` method.
 
         Args:
             metadata (dict):
@@ -623,29 +623,30 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the given
             specification dictionary.
         """
-        hyperparameters = metadata.get('hyperparameters')
-        tunable = metadata.get('tunable_hyperparameters')
+        # hyperparameters = metadata.get('hyperparameters')
+        # tunable = metadata.get('tunable_hyperparameters')
 
-        pipeline = cls(
-            metadata['primitives'],
-            metadata.get('init_params'),
-            metadata.get('input_names'),
-            metadata.get('output_names'),
-        )
+        # pipeline = cls(
+        #     metadata['primitives'],
+        #     metadata.get('init_params'),
+        #     metadata.get('input_names'),
+        #     metadata.get('output_names'),
+        # )
 
-        if hyperparameters:
-            pipeline.set_hyperparameters(hyperparameters)
+        # if hyperparameters:
+        #     pipeline.set_hyperparameters(hyperparameters)
 
-        if tunable is not None:
-            pipeline._tunable_hyperparameters = tunable
+        # if tunable is not None:
+        #     pipeline._tunable_hyperparameters = tunable
 
-        return pipeline
+        # return pipeline
+        return cls(metadata)
 
     @classmethod
     def load(cls, path):
         """Create a new MLPipeline from a JSON specification.
 
-        The JSON file format is the same as the one created by the `to_dict` method.
+        The JSON file format is the same as the one created by the ``to_dict`` method.
 
         Args:
             path (str):
diff --git a/tests/features/test_pipeline_loading.py b/tests/features/test_pipeline_loading.py
index bc344d63..4b363d07 100644
--- a/tests/features/test_pipeline_loading.py
+++ b/tests/features/test_pipeline_loading.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-from unittest.mock import Mock
 
 from mlblocks import MLPipeline
 
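With `from_dict` reduced to a single `cls(metadata)` call, building a pipeline works the same way regardless of the input type that `MLPipeline.__init__` receives. For illustration, a minimal sketch of the four construction modes (the primitive name assumes that a compatible JSON annotation, such as the ones shipped with MLPrimitives, is available; the pipeline name in the last call is hypothetical):

```python
from mlblocks import MLPipeline

# From a list of primitive names.
pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# From a complete specification dict, such as the one returned by to_dict().
from_spec = MLPipeline(pipeline.to_dict())

# From another MLPipeline instance, effectively cloning it.
clone = MLPipeline(pipeline)

# From the name of a pipeline JSON annotation, resolved through load_pipeline.
named = MLPipeline('some.pipeline.name')  # hypothetical annotation name
```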
From d97ad54e547665488fd2dcea21ec8369d95fcb7f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 13:56:51 -0400
Subject: [PATCH 024/160] Update the readme to the latest API changes

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index fb8d3885..cd454b73 100644
--- a/README.md
+++ b/README.md
@@ -81,10 +81,10 @@ them to the `MLPipeline` class.
 >>> pipeline = MLPipeline(primitives)
 ```
 
-Optionally, specific hyperparameters can be also set by specifying them in a dictionary:
+Optionally, specific initialization arguments can also be set by specifying them in a dictionary:
 
 ```python
->>> hyperparameters = {
+>>> init_params = {
 ...     'skimage.feature.hog': {
 ...         'multichannel': True,
 ...         'visualize': False
 ...     },
 ...     'sklearn.ensemble.RandomForestClassifier': {
 ...         'n_estimators': 100,
 ...     }
 ... }
->>> pipeline = MLPipeline(primitives, hyperparameters)
+>>> pipeline = MLPipeline(primitives, init_params=init_params)
 ```
 
 If you want to see which hyperparameters a particular pipeline is using, you can do so by calling

From 221cfb82ac9f6f7cd413043429916a5528567b0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 16:24:00 -0400
Subject: [PATCH 025/160] Remove commented code

---
 mlblocks/mlpipeline.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b73d96b9..ce31780f 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -623,23 +623,6 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the given
             specification dictionary.
         """
-        # hyperparameters = metadata.get('hyperparameters')
-        # tunable = metadata.get('tunable_hyperparameters')
-
-        # pipeline = cls(
-        #     metadata['primitives'],
-        #     metadata.get('init_params'),
-        #     metadata.get('input_names'),
-        #     metadata.get('output_names'),
-        # )
-
-        # if hyperparameters:
-        #     pipeline.set_hyperparameters(hyperparameters)
-
-        # if tunable is not None:
-        #     pipeline._tunable_hyperparameters = tunable
-
-        # return pipeline
         return cls(metadata)
 
     @classmethod
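With the commented-out branches removed, `from_dict` delegates everything to the constructor, so `save` and `load` round-trip a pipeline through JSON without any extra logic. A short sketch, assuming a primitives library such as MLPrimitives is installed so the annotation can be found (the file path is arbitrary):

```python
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# to_dict() captures primitives, init_params, input and output names,
# hyperparameters and tunable hyperparameters in a single dict.
pipeline.save('pipeline.json')

loaded = MLPipeline.load('pipeline.json')
assert loaded.to_dict() == pipeline.to_dict()
```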
From 197c47f3cc7bbe6e683a971866bbd9e52b9821d9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Tue, 21 May 2019 17:05:49 -0400
Subject: [PATCH 026/160] Add instructions to install MLPrimitives

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index cd454b73..01629dc8 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,26 @@ make install
 For development, you can use `make install-develop` instead in order to install all
 the required dependencies for testing and code linting.
 
+## MLPrimitives
+
+In order to be usable, MLBlocks requires a compatible primitives library.
+
+The official library, required in order to follow the MLBlocks tutorial below,
+is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), which you can install
+with this command:
+
+```bash
+pip install mlprimitives
+```
+
 # Usage Example
 
 Below is a short example of how to use MLBlocks to create a simple
 pipeline, fit it using demo data and use it to make predictions.
 
+Please make sure to have installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives)
+before following it.
+
 For advanced usage and more detailed explanation about each component, please have a look at the [documentation](https://HDI-Project.github.io/MLBlocks)

From d451c7c3d2f9eb4972f8a1c38edbb468410b7d44 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 22 May 2019 15:12:57 -0400
Subject: [PATCH 027/160] Address PR feedback

---
 docs/getting_started/quickstart.rst |  7 ++++---
 mlblocks/discovery.py               |  9 +++++++++
 mlblocks/mlpipeline.py              | 11 +++++------
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index c3edf475..2887da05 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -29,16 +29,17 @@ them to the `MLPipeline class`_:
     ]
     pipeline = MLPipeline(primitives)
 
-Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary:
+Optionally, specific `hyperparameters`_ can also be set by specifying them in a dictionary and
+passing them as the ``init_params`` argument:
 
 .. ipython:: python
 
-    hyperparameters = {
+    init_params = {
         'sklearn.ensemble.RandomForestClassifier': {
             'n_estimators': 100
         }
     }
-    pipeline = MLPipeline(primitives, init_params=hyperparameters)
+    pipeline = MLPipeline(primitives, init_params=init_params)
 
 Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been
 set for each block, by calling the `get_hyperparameters method`_.
diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 1f952b81..51ff13cd 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -39,11 +39,16 @@ def _add_lookup_path(path, paths):
     Args:
         path (str):
             path to add
+        paths (list):
+            list where the new path will be added.
 
     Raises:
         ValueError:
             A ``ValueError`` will be raised if the path is not valid.
 
+    Returns:
+        bool:
+            Whether the new path was added or not.
     """
     if path not in paths:
         if not os.path.isdir(path):
@@ -52,6 +57,8 @@ def _add_lookup_path(path, paths):
         paths.insert(0, os.path.abspath(path))
         return True
 
+    return False
+
 
 def add_primitives_path(path):
     """Add a new path to look for primitives.
@@ -191,6 +198,8 @@ def _load(name, paths):
     Args:
         name (str):
             name of the JSON to look for. The name should not contain the
            ``.json`` extension, as it will be added dynamically.
+        paths (list):
+            list of paths where the primitives will be looked for.
Returns: dict: diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index ce31780f..b31502ea 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -80,19 +80,18 @@ def _get_tunable_hyperparameters(self): return tunable - @staticmethod - def _build_blocks(primitives, init_params): + def _build_blocks(self): blocks = OrderedDict() block_names_count = Counter() - for primitive in primitives: + for primitive in self.primitives: try: block_names_count.update([primitive]) block_count = block_names_count[primitive] block_name = '{}#{}'.format(primitive, block_count) - block_params = init_params.get(block_name, dict()) + block_params = self.init_params.get(block_name, dict()) if not block_params: - block_params = init_params.get(primitive, dict()) + block_params = self.init_params.get(primitive, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " "for more than one block %s."), primitive) @@ -137,7 +136,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) - self.blocks = self._build_blocks(self.primitives, self.init_params) + self.blocks = self._build_blocks() self.input_names = input_names or pipeline.get('input_names', dict()) self.output_names = output_names or pipeline.get('output_names', dict()) From 8b2b7aaecd72637d9769bfb9ad94025f242e2872 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 16:16:27 -0400 Subject: [PATCH 028/160] rename mlprimitives.jsons_path to mlblocks.primitives and support multiple paths --- mlblocks/discovery.py | 48 +++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 51ff13cd..b5ca840d 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -102,17 +102,17 @@ def add_pipelines_path(path): LOGGER.debug('New pipelines path added: %s', path) -def _get_lookup_paths(entry_point): - """Get the list of folders where elements will be looked for. +def _load_entry_points(entry_point_name, entry_point_group='mlblocks'): + """Get a list of folders from entry points. - This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the entry_point name. + This list will include the value of any entry point named after the given + ``entry_point_name`` published under the given ``entry_point_group``. An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } @@ -129,11 +129,14 @@ def _get_lookup_paths(entry_point): The list of folders. """ lookup_paths = list() - entry_points = pkg_resources.iter_entry_points(entry_point) + entry_points = pkg_resources.iter_entry_points(entry_point_group) for entry_point in entry_points: - if entry_point.name == 'jsons_path': - path = entry_point.load() - lookup_paths.append(path) + if entry_point.name == entry_point_name: + paths = entry_point.load() + if isinstance(paths, str): + lookup_paths.append(paths) + elif isinstance(paths, (list, tuple)): + lookup_paths.extend(paths) return lookup_paths @@ -141,14 +144,18 @@ def _get_lookup_paths(entry_point): def get_primitives_paths(): """Get the list of folders where primitives will be looked for. 
- This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the ``mlprimitives`` name. + This list will include the values of all the entry points named ``primitives`` + published under the entry point group ``mlblocks``. + + Also, for backwards compatibility reasons, the paths from the entry points + named ``jsons_path`` published under the ``mlprimitives`` group will also + be included. An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } @@ -160,20 +167,21 @@ def get_primitives_paths(): list: The list of folders. """ - return _PRIMITIVES_PATHS + _get_lookup_paths('mlprimitives') + paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives') + return _PRIMITIVES_PATHS + paths def get_pipelines_paths(): """Get the list of folders where pipelines will be looked for. - This list will include the value of any ``entry_point`` named ``jsons_path`` published under - the ``mlpipelines`` name. + This list will include the values of all the entry points named ``pipelines`` + published under the entry point group ``mlblocks``. An example of such an entry point would be:: entry_points = { - 'mlpipelines': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'pipelines=some_module:SOME_VARIABLE' ] } @@ -185,7 +193,7 @@ def get_pipelines_paths(): list: The list of folders. """ - return _PIPELINES_PATHS + _get_lookup_paths('mlpipelines') + return _PIPELINES_PATHS + _load_entry_points('pipelines') def _load(name, paths): From cc012b013b27f8f301eda8adc8edcc2a79a37c57 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 16:16:41 -0400 Subject: [PATCH 029/160] Add unit tests for mlblocks discovery --- tests/test_discovery.py | 151 ++++++++++++++++++++++++++++++++-------- 1 file changed, 122 insertions(+), 29 deletions(-) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 3a7c3321..59bd4404 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -4,7 +4,7 @@ import os import tempfile import uuid -from unittest.mock import patch +from unittest.mock import call, patch import pytest from pkg_resources import Distribution, EntryPoint @@ -14,92 +14,185 @@ FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake' -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_add_primitives_path_do_nothing(): - discovery.add_primitives_path('a') +def test__add_lookup_path_do_nothing(): + paths = ['a', 'b'] + discovery._add_lookup_path('a', paths) - assert discovery._PRIMITIVES_PATHS == ['a', 'b'] + assert paths == ['a', 'b'] -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_add_primitives_path_exception(): +def test__add_lookup_path_exception(): + paths = ['a', 'b'] invalid_path = str(uuid.uuid4()) with pytest.raises(ValueError): - discovery.add_primitives_path(invalid_path) + discovery._add_lookup_path(invalid_path, paths) + + +def test__add_lookup_path(): + paths = ['a', 'b'] + discovery._add_lookup_path('tests', paths) + + expected_path = os.path.abspath('tests') + + assert paths == [expected_path, 'a', 'b'] @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) def test_add_primitives_path(): - discovery.add_primitives_path('tests') + discovery.add_primitives_path(os.path.abspath('tests')) expected_path = os.path.abspath('tests') - assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] 
+@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +def test_add_pipelines_path(): + discovery.add_pipelines_path('tests') + + expected_path = os.path.abspath('tests') + assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b'] + + @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) @patch('mlblocks.discovery.pkg_resources.iter_entry_points') -def test_get_primitives_paths_no_entry_points(iep_mock): +def test__load_entry_points_no_entry_points(iep_mock): # setup iep_mock.return_value == [] # run - paths = discovery.get_primitives_paths() + paths = discovery._load_entry_points('jsons_path', 'mlprimitives') # assert - assert paths == ['a', 'b'] - iep_mock.assert_called_once_with('mlprimitives') + assert paths == [] + expected_calls = [ + call('mlprimitives'), + ] + assert iep_mock.call_args_list == expected_calls -@patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) @patch('mlblocks.discovery.pkg_resources.iter_entry_points') -def test_get_primitives_paths_entry_points(iep_mock): +def test__load_entry_points_entry_points(iep_mock): # setup something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') - jsons_path_ep = EntryPoint( - 'jsons_path', + primitives_ep = EntryPoint( + 'primitives', 'tests.test_discovery', attrs=['FAKE_MLPRIMITIVES_PATH'], dist=Distribution() ) iep_mock.return_value = [ something_else_ep, - jsons_path_ep + primitives_ep ] # run - paths = discovery.get_primitives_paths() + paths = discovery._load_entry_points('primitives') # assert expected = [ - 'a', - 'b', 'this/is/a/fake' ] assert paths == expected - iep_mock.assert_called_once_with('mlprimitives') + expected_calls = [ + call('mlblocks'), + ] + assert iep_mock.call_args_list == expected_calls @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) -def test_load_primitive_value_error(): - with pytest.raises(ValueError): - discovery.load_primitive('invalid.primitive') +@patch('mlblocks.discovery._load_entry_points') +def test_get_primitives_paths(lep_mock): + lep_mock.side_effect = [['c'], []] + + paths = discovery.get_primitives_paths() + + assert paths == ['a', 'b', 'c'] + expected_calls = [ + call('primitives'), + call('jsons_path', 'mlprimitives'), + ] + assert lep_mock.call_args_list == expected_calls + + +@patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) +@patch('mlblocks.discovery._load_entry_points') +def test_get_pipelines_paths(lep_mock): + lep_mock.return_value = ['c'] + paths = discovery.get_pipelines_paths() -def test_load_primitive_success(): + assert paths == ['a', 'b', 'c'] + lep_mock.assert_called_once_with('pipelines') + + +def test__load_value_error(): + primitive = discovery._load('invalid.primitive', ['a', 'b']) + + assert primitive is None + + +def test__load_success(): primitive = { 'name': 'temp.primitive', 'primitive': 'temp.primitive' } with tempfile.TemporaryDirectory() as tempdir: - discovery.add_primitives_path(tempdir) + paths = [tempdir] primitive_path = os.path.join(tempdir, 'temp.primitive.json') with open(primitive_path, 'w') as primitive_file: json.dump(primitive, primitive_file, indent=4) - loaded = discovery.load_primitive('temp.primitive') + loaded = discovery._load('temp.primitive', paths) assert primitive == loaded + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + 
discovery.load_primitive('invalid.primitive') + + load_mock.assert_called_once_with('invalid.primitive', ['a', 'b']) + + +@patch('mlblocks.discovery.get_primitives_paths') +@patch('mlblocks.discovery._load') +def test__load_primitive_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + primitive = discovery.load_primitive('valid.primitive') + + load_mock.assert_called_once_with('valid.primitive', ['a', 'b']) + + assert primitive == load_mock.return_value + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_value_error(load_mock, gpp_mock): + load_mock.return_value = None + gpp_mock.return_value = ['a', 'b'] + + with pytest.raises(ValueError): + discovery.load_pipeline('invalid.pipeline') + + load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b']) + + +@patch('mlblocks.discovery.get_pipelines_paths') +@patch('mlblocks.discovery._load') +def test__load_pipeline_success(load_mock, gpp_mock): + gpp_mock.return_value = ['a', 'b'] + + pipeline = discovery.load_pipeline('valid.pipeline') + + load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) + + assert pipeline == load_mock.return_value From e5de2532b0c83d27c72efc615bd8c680720a5f2d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 22 May 2019 19:39:52 -0400 Subject: [PATCH 030/160] Update docs about primitives entry_points --- docs/advanced_usage/adding_primitives.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index e3d4b964..9d358629 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -91,20 +91,27 @@ In order to make **MLBLocks** able to find the primitives defined in such a libr all you need to do is setting up an `Entry Point`_ in your `setup.py` script with the following specification: -1. It has to be published under the name ``mlprimitives``. -2. It has to be named exactly ``jsons_path``. -3. It has to point at a variable that contains the path to the JSONS folder. +1. It has to be published under the group ``mlblocks``. +2. It has to be named exactly ``primitives``. +3. It has to point at a variable that contains a path or a list of paths to the JSONS folder(s). An example of such an entry point would be:: entry_points = { - 'mlprimitives': [ - 'jsons_path=some_module:SOME_VARIABLE' + 'mlblocks': [ + 'primitives=some_module:SOME_VARIABLE' ] } where the module `some_module` contains a variable such as:: - SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') + SOME_VARIABLE = 'path/to/primitives' + +or:: + + SOME_VARIABLE = [ + 'path/to/primitives', + 'path/to/more/primitives' + ] .. 
_Entry Point: https://packaging.python.org/specifications/entry-points/
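To make the documented mechanism concrete, this is roughly what the `setup.py` of a third-party primitives library would contain; the package name `my_primitives` and the variable name are illustrative, not taken from any real package:

```python
# setup.py of a hypothetical primitives package
from setuptools import setup

setup(
    name='my-primitives',
    packages=['my_primitives'],
    entry_points={
        'mlblocks': [
            'primitives=my_primitives:MLBLOCKS_PRIMITIVES'
        ]
    },
)
```

with `my_primitives/__init__.py` defining the variable that points at the folder, or list of folders, that holds the JSON annotations:

```python
# my_primitives/__init__.py
import os

MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'jsons')
```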
""" - pipeline = _load(name, get_pipelines_paths()) - if not pipeline: - raise ValueError("Unknown pipeline: {}".format(name)) + pipeline = _PIPELINES.get(name) + if pipeline is None: + pipeline = _load(name, get_pipelines_paths()) + if pipeline is None: + raise ValueError("Unknown pipeline: {}".format(name)) + + _PIPELINES[name] = pipeline return pipeline + + +def _search_annotations(base_path, pattern, parts=None): + annotations = dict() + parts = parts or list() + if os.path.exists(base_path): + for name in os.listdir(base_path): + path = os.path.abspath(os.path.join(base_path, name)) + if os.path.isdir(path): + annotations.update(_search_annotations(path, pattern, parts + [name])) + elif path not in annotations: + name = '.'.join(parts + [name]) + if pattern.search(name) and name.endswith('.json'): + annotations[path] = name[:-5] + + return annotations + + +def _get_annotations_list(paths, loader, pattern, **metadata_filters): + pattern = re.compile(pattern) + annotations = dict() + for base_path in paths: + annotations.update(_search_annotations(base_path, pattern)) + + matching = list() + for name in sorted(annotations.values()): + annotation = loader(name) + metadata = annotation.get('metadata', dict()) + for key, value in metadata_filters.items(): + metadata_value = metadata.get(key, '') + if not re.search(value, metadata_value): + break + + else: + matching.append(name) + + return matching + + +def get_primitives_list(pattern='', **metadata_filters): + return _get_annotations_list( + get_primitives_paths(), load_primitive, pattern, **metadata_filters) + + +def get_pipelines_list(pattern='', **metadata_filters): + return _get_annotations_list( + get_pipelines_paths(), load_pipeline, pattern, **metadata_filters) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 66bbf8fe..6370b4cf 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -27,6 +27,8 @@ class MLBlock(): Attributes: name (str): Name given to this MLBlock. + metadata (dict): + Additional information about this primitive primitive (object): the actual function or instance which this MLBlock wraps. 
fit_args (dict):
@@ -143,22 +145,22 @@ def _get_tunable(cls, hyperparameters, init_params):
 
     def __init__(self, name, **kwargs):
         self.name = name
 
-        metadata = load_primitive(name)
+        primitive = load_primitive(name)
 
-        self.primitive = import_object(metadata['primitive'])
+        self.primitive = import_object(primitive['primitive'])
 
-        self._fit = metadata.get('fit', dict())
+        self._fit = primitive.get('fit', dict())
         self.fit_args = self._fit.get('args', [])
         self.fit_method = self._fit.get('method')
 
-        self._produce = metadata['produce']
+        self._produce = primitive['produce']
         self.produce_args = self._produce['args']
         self.produce_output = self._produce['output']
         self.produce_method = self._produce.get('method')
 
         self._class = bool(self.produce_method)
 
-        hyperparameters = metadata.get('hyperparameters', dict())
+        hyperparameters = primitive.get('hyperparameters', dict())
         init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters)
 
         self._hyperparameters = init_params
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 59bd4404..3681611b 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -11,7 +11,7 @@
 
 from mlblocks import discovery
 
-FAKE_MLPRIMITIVES_PATH = 'this/is/a/fake'
+FAKE_PRIMITIVES_PATH = 'this/is/a/fake'
 
 
 def test__add_lookup_path_do_nothing():
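The exploration helpers introduced in this commit (`get_primitives_list` and `get_pipelines_list`) land without docstrings, so a brief usage sketch may help; the pattern and the `author` metadata key are only examples, and the results depend on which annotations are installed:

```python
from mlblocks import discovery

# Every available primitive whose dotted name matches a regular expression.
forests = discovery.get_primitives_list('RandomForest')

# Keyword arguments are treated as regexes matched against the values of
# the annotation's `metadata` dict; `author` is just an example of a key.
by_author = discovery.get_primitives_list(author='MIT')
```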

From 467948e4088915eabbe2e6853e2d88408a10e96d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 4 Jul 2019 17:57:48 -0400
Subject: [PATCH 032/160] Add support to work with hyperparameters in the format used by BTB

---
 mlblocks/mlpipeline.py | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index b31502ea..3c08f444 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,7 +4,9 @@
 
 import json
 import logging
-from collections import Counter, OrderedDict
+from collections import Counter, OrderedDict, defaultdict
+
+import numpy as np
 
 from mlblocks.discovery import load_pipeline
 from mlblocks.mlblock import MLBlock
 
 LOGGER = logging.getLogger(__name__)
@@ -161,18 +163,112 @@ def get_tunable_hyperparameters(self):
         """
         return self._tunable_hyperparameters.copy()
 
-    def get_hyperparameters(self):
+    @classmethod
+    def _sanitize_value(cls, value):
+        """Convert numpy values to their python primitive type equivalent.
+
+        If a value is a dict, recursively sanitize its values.
+
+        Args:
+            value:
+                value to sanitize.
+
+        Returns:
+            sanitized value.
+        """
+        if isinstance(value, dict):
+            return {
+                key: cls._sanitize_value(value)
+                for key, value in value.items()
+            }
+        if isinstance(value, np.integer):
+            return int(value)
+        elif isinstance(value, np.floating):
+            return float(value)
+        elif isinstance(value, np.ndarray):
+            return value.tolist()
+        elif isinstance(value, np.bool_):
+            return bool(value)
+        elif value == 'None':
+            return None
+
+        return value
+
+    @classmethod
+    def _sanitize(cls, hyperparameters):
+        """Convert tuple hyperparameter keys to nested dicts.
+
+        Also convert numpy types to primitive python types.
+
+        The input hyperparameters dict can specify them in two formats:
+
+        One is the native MLBlocks format, where each key is the name of a block and each value
+        is a dict containing a complete hyperparameter specification for that block::
+
+            {
+                "block_name": {
+                    "hyperparameter_name": "hyperparameter_value",
+                    ...
+                },
+                ...
+            }
+
+        The other one is an alternative format where each key is a two-element tuple containing
+        the name of the block as the first element and the name of the hyperparameter as the
+        second one::
+
+            {
+                ("block_name", "hyperparameter_name"): "hyperparameter_value",
+                ...
+            }
+
+
+        Args:
+            hyperparameters (dict):
+                hyperparameters dict to sanitize.
+
+        Returns:
+            dict:
+                Sanitized dict.
+        """
+        params_tree = defaultdict(dict)
+        for key, value in hyperparameters.items():
+            value = cls._sanitize_value(value)
+            if isinstance(key, tuple):
+                block, hyperparameter = key
+                params_tree[block][hyperparameter] = value
+            else:
+                params_tree[key] = value
+
+        return params_tree
+
+    def get_hyperparameters(self, flat=False):
         """Get the current hyperparameters of each block.
 
+        Args:
+            flat (bool): If True, return a flattened dictionary where each key
+                is a two-element tuple containing the name of the block as the first
+                element and the name of the hyperparameter as the second one.
+                If False (default), return a dictionary where each key is the name of
+                a block and each value is a dictionary containing the complete
+                hyperparameter specification of that block.
+
         Returns:
             dict:
                 A dictionary containing the block names as keys and the current block
                 hyperparameters dictionary as values.
         """
-        hyperparameters = {}
+        hyperparameters = dict()
         for block_name, block in self.blocks.items():
             hyperparameters[block_name] = block.get_hyperparameters()
 
+        if flat:
+            hyperparameters = {
+                (block, name): value
+                for block, block_hyperparameters in hyperparameters.items()
+                for name, value in block_hyperparameters.items()
+            }
+
         return hyperparameters
 
     def set_hyperparameters(self, hyperparameters):
@@ -183,6 +279,7 @@ def set_hyperparameters(self, hyperparameters):
                 A dictionary containing the block names as keys and the new hyperparameters
                 dictionary as values.
         """
+        hyperparameters = self._sanitize(hyperparameters)
         for block_name, block_hyperparams in hyperparameters.items():
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
 
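After this change, `set_hyperparameters` accepts both formats described in the `_sanitize` docstring, so proposals coming from a BTB-style tuner can be applied without reshaping them first. A sketch with illustrative block and hyperparameter names, assuming the primitive annotation is available:

```python
from mlblocks import MLPipeline

pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

# Native nested format: block name -> hyperparameter dict.
pipeline.set_hyperparameters({
    'sklearn.ensemble.RandomForestClassifier#1': {
        'n_estimators': 100
    }
})

# Flat, tuple-keyed format; _sanitize converts it to the nested
# form, so this call is equivalent to the one above.
pipeline.set_hyperparameters({
    ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 100
})

# get_hyperparameters(flat=True) returns the same tuple-keyed shape.
flat = pipeline.get_hyperparameters(flat=True)
```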
+ Args: + flat (bool): If True, return a flattened dictionary where each key + is a two elements tuple containing the name of the block as the first + element and the name of the hyperparameter as the second one. + If False (default), return a dictionary where each key is the name of + a block and each value is a dictionary containing the complete + hyperparameter specification of that block. + Returns: dict: A dictionary containing the block names as keys and the block tunable hyperparameters dictionary as values. """ - return self._tunable_hyperparameters.copy() + tunables = self._tunable_hyperparameters.copy() + if flat: + tunables = self._flatten_dict(tunables) + + return tunables @classmethod def _sanitize_value(cls, value): @@ -263,11 +283,7 @@ def get_hyperparameters(self, flat=False): hyperparameters[block_name] = block.get_hyperparameters() if flat: - hyperparameters = { - (block, name): value - for block, block_hyperparameters in hyperparameters.items() - for name, value in block_hyperparameters.items() - } + hyperparameters = self._flatten_dict(hyperparameters) return hyperparameters diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 741be194..906c2c61 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -85,9 +85,72 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable + def test_get_tunable_hyperparameters_flat(self): + mlpipeline = MLPipeline(list()) + tunable = { + 'block_1': { + 'hp_1': { + 'type': 'int', + 'range': [ + 1, + 10 + ], + } + }, + 'block_2': { + 'hp_1': { + 'type': 'str', + 'default': 'a', + 'values': [ + 'a', + 'b', + 'c' + ], + }, + 'hp_2': { + 'type': 'bool', + 'default': True, + } + } + } + mlpipeline._tunable_hyperparameters = tunable + + returned = mlpipeline.get_tunable_hyperparameters(flat=True) + + expected = { + ('block_1', 'hp_1'): { + 'type': 'int', + 'range': [ + 1, + 10 + ], + }, + ('block_2', 'hp_1'): { + 'type': 'str', + 'default': 'a', + 'values': [ + 'a', + 'b', + 'c' + ], + }, + ('block_2', 'hp_2'): { + 'type': 'bool', + 'default': True, + } + } + assert returned == expected + def test_get_hyperparameters(self): block_1 = Mock() + block_1.get_hyperparameters.return_value = { + 'a': 'a' + } block_2 = Mock() + block_2.get_hyperparameters.return_value = { + 'b': 'b', + 'c': 'c', + } blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -98,8 +161,40 @@ def test_get_hyperparameters(self): hyperparameters = mlpipeline.get_hyperparameters() assert hyperparameters == { - 'a.primitive.Name#1': block_1.get_hyperparameters.return_value, - 'a.primitive.Name#2': block_2.get_hyperparameters.return_value, + 'a.primitive.Name#1': { + 'a': 'a', + }, + 'a.primitive.Name#2': { + 'b': 'b', + 'c': 'c', + }, + } + block_1.get_hyperparameters.assert_called_once_with() + block_2.get_hyperparameters.assert_called_once_with() + + def test_get_hyperparameters_flat(self): + block_1 = Mock() + block_1.get_hyperparameters.return_value = { + 'a': 'a' + } + block_2 = Mock() + block_2.get_hyperparameters.return_value = { + 'b': 'b', + 'c': 'c', + } + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + mlpipeline = MLPipeline(list()) + mlpipeline.blocks = blocks + + hyperparameters = mlpipeline.get_hyperparameters(flat=True) + + assert hyperparameters == { + ('a.primitive.Name#1', 'a'): 'a', + ('a.primitive.Name#2', 'b'): 'b', + ('a.primitive.Name#2', 'c'): 'c', } 
block_1.get_hyperparameters.assert_called_once_with()
        block_2.get_hyperparameters.assert_called_once_with()

@@ -124,6 +219,24 @@ def test_set_hyperparameters(self):
         block_1.set_hyperparameters.assert_not_called()
         block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
 
+    def test_set_hyperparameters_flat(self):
+        block_1 = Mock()
+        block_2 = Mock()
+        blocks = OrderedDict((
+            ('a.primitive.Name#1', block_1),
+            ('a.primitive.Name#2', block_2),
+        ))
+        mlpipeline = MLPipeline(list())
+        mlpipeline.blocks = blocks
+
+        hyperparameters = {
+            ('a.primitive.Name#2', 'some'): 'arg'
+        }
+        mlpipeline.set_hyperparameters(hyperparameters)
+
+        block_1.set_hyperparameters.assert_not_called()
+        block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'})
+
     def test__get_block_args(self):
         pass

From 5bd5a709f853b06b564bd607ee904ea9f95269c9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 5 Jul 2019 20:17:45 -0400
Subject: [PATCH 034/160] Fix setuptools version to fix dependency issues on
 tests

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9fca4dfa..9c7b3d2e 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,8 @@
     'pytest>=3.4.2',
     'pytest-cov>=2.6.0',
     'mlprimitives>=0.1.3,<0.2',
-    'urllib3>=1.20,<1.25'
+    'urllib3>=1.20,<1.25',
+    'setuptools>=41.0.0'
 ]


From 4dcf6022a78ca7230c7c0f714bd7185fdc4dd195 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 8 Jul 2019 13:29:25 -0400
Subject: [PATCH 035/160] Add docs for intermediate outputs

---
 docs/advanced_usage/pipelines.rst | 82 ++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index 33d57cdc..e87a0067 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -271,7 +271,7 @@ Like primitives, Pipelines can also be annotated and stored as dicts or JSON fil
 the different arguments expected by the ``MLPipeline`` class, as well as the set
 hyperparameters and tunable hyperparameters.
 
-Representing a Pipeline as a dict 
+Representing a Pipeline as a dict
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The dict representation of a Pipeline can be obtained directly from an ``MLPipeline`` instance,
@@ -344,6 +344,86 @@ that allows loading the pipeline directly from a JSON file:
 
     pipeline = MLPipeline.load('pipeline.json')
 
+
+Intermediate Outputs and Partial Execution
+------------------------------------------
+
+Sometimes we might be interested in capturing an intermediate output within a
+pipeline execution in order to inspect it for debugging purposes, or to reuse
+it later on to speed up a tuning process where the pipeline needs to be
+executed multiple times over the same data.
+
+For this, two special arguments have been included in the ``fit`` and ``predict``
+methods of an ``MLPipeline``:
+
+output\_
+~~~~~~~~
+
+The ``output_`` argument indicates which block within the pipeline we are interested
+in taking the output values from. This, implicitly, indicates up to which block the
+pipeline needs to be executed within ``fit`` and ``predict`` before returning.
+
+The ``output_`` argument is optional, and it can be ``None``, which is the default,
+an Integer or a String.
+
+It is interpreted as follows:
+
+* If it is ``None`` (default), the ``fit`` method will return nothing and the
+  ``predict`` method will return the output of the last block in the pipeline.
+* If an integer is given, it is interpreted as the block index, starting at 0,
+  and the whole context after executing the specified block will be returned.
+  In case of ``fit``, this means that the outputs will be returned after fitting
+  a block and then producing it on the same data.
+* If it is a string, it can be interpreted in three ways:
+
+  * **block name**: If the string matches a block name exactly, including
+    its hash and counter number ``#n`` at the end, the whole context will be
+    returned after that block is produced.
+  * **variable_name**: If the string does not match any block name and does
+    not contain any dot character, ``'.'``, it will be considered a variable
+    name. In this case, the indicated variable will be extracted from the
+    context and returned after the last block has been produced.
+  * **block_name + variable_name**: If the complete string does not match a
+    block name but it contains at least one dot, ``'.'``, it will be split
+    in two parts on the last dot. If the first part of the string matches a
+    block name exactly, the second part of the string will be considered a
+    variable name, assuming the format ``{block_name}.{variable_name}``, and
+    the indicated variable will be extracted from the context and returned
+    after the block has been produced. Otherwise, if the extracted
+    ``block_name`` does not match a block name exactly, a ``ValueError``
+    will be raised.
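+
+As a minimal sketch of these options, assuming a pipeline whose first block ended up
+named ``a.primitive.Name#1`` (the block and variable names here are illustrative),
+all of these calls are valid::
+
+    # whole context after the first block, selected by index
+    context = pipeline.fit(X_train, y_train, output_=0)
+
+    # whole context after the same block, selected by name
+    context = pipeline.fit(X_train, y_train, output_='a.primitive.Name#1')
+
+    # only the y variable produced by that block
+    y = pipeline.fit(X_train, y_train, output_='a.primitive.Name#1.y')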
+
+start\_
+~~~~~~~
+
+The ``start_`` argument indicates which block within the pipeline we are interested
+in starting the computation from when executing ``fit`` and ``predict``, allowing us
+to skip some of the initial blocks.
+
+The ``start_`` argument is optional, and it can be ``None``, which is the default,
+an Integer or a String.
+
+It is interpreted as follows:
+
+* If it is ``None``, the execution will start on the first block.
+* If it is an integer, it is interpreted as the block index.
+* If it is a string, it is expected to be the name of the block, including the counter
+  number at the end.
+
+This is especially useful when used in combination with the ``output_`` argument, as it
+effectively allows us both to capture intermediate outputs for debugging purposes and to
+reuse intermediate states of the pipeline to accelerate tuning processes.
+
+An example of this situation, where we want to reuse the output of the first block, could be::
+
+    context_0 = pipeline.fit(X_train, y_train, output_=0)
+
+    # Afterwards, within the tuning loop
+    pipeline.fit(start_=1, **context_0)
+    predictions = pipeline.predict(X_test)
+    score = compute_score(y_test, predictions)
+
+
 .. _API Reference: ../api_reference.html
 .. _primitives: ../primitives.html
 .. 
_mlblocks.MLPipeline: ../api_reference.html#mlblocks.MLPipeline From f93c8b155e6c17cc589bac2a6364e0db7443927d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:37:57 -0400 Subject: [PATCH 036/160] Add release notes for v0.3.1 --- HISTORY.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index a312c9cb..e6b14674 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,20 @@ Changelog ========= +0.3.1 - Pipelines Discovery +--------------------------- + +* Support flat hyperparameter dictionaries + [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` + [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala +* Implement partial re-fit + [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock + [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs + [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala + 0.3.0 - New Primitives Discovery -------------------------------- From 0d3ba9245e93a83f6a5d674e4cf84917ec3f898b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:38:04 -0400 Subject: [PATCH 037/160] =?UTF-8?q?Bump=20version:=200.3.1-dev=20=E2=86=92?= =?UTF-8?q?=200.3.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 7 ++++--- setup.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 37199013..b47c8962 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.1-dev' +__version__ = '0.3.1' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 17244565..d4103297 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,15 +1,15 @@ [bumpversion] -current_version = 0.3.1-dev +current_version = 0.3.1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? 
-serialize = +serialize = {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release -values = +values = dev release @@ -51,3 +51,4 @@ max-line-length = 99 [pydocstyle] add-ignore = D403,D413,D105,D107 + diff --git a/setup.py b/setup.py index 9c7b3d2e..3f01d72e 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.1-dev', + version='0.3.1', zip_safe=False, ) From 28a9a44373d10cd0b8e41ead686889535a4b7269 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 8 Jul 2019 14:38:21 -0400 Subject: [PATCH 038/160] =?UTF-8?q?Bump=20version:=200.3.1=20=E2=86=92=200?= =?UTF-8?q?.3.2-dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index b47c8962..b528aefe 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.1' +__version__ = '0.3.2-dev' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index d4103297..1967b27b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.1 +current_version = 0.3.2-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? diff --git a/setup.py b/setup.py index 3f01d72e..98350606 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.1', + version='0.3.2-dev', zip_safe=False, ) From 677ef256ef5e23c4abfe52b8b5a2f839bf5cdf1d Mon Sep 17 00:00:00 2001 From: Kalyan Veeramachaneni Date: Sun, 14 Jul 2019 19:01:25 -0700 Subject: [PATCH 039/160] Update README.md --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 01629dc8..5b4f2519 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ -

-[header image markup stripped during extraction]
-“MLBlocks”
+[header image markup stripped during extraction]
+“MLBlocksr”
+An open source project from Data to AI Lab at MIT.
+“MLBlocks”
 
 Pipelines and Primitives for Machine Learning and Data Science.
 
[![PyPi][pypi-img]][pypi-url] From 98b4d245c5cefc68f1ce3d1a7217f961dfe3378c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 17 Jul 2019 14:29:07 +0200 Subject: [PATCH 040/160] Isolate block hyperparams from primitives --- mlblocks/mlblock.py | 9 +++++---- tests/test_mlblock.py | 46 +++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 66bbf8fe..fa67bd6b 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -4,6 +4,7 @@ import importlib import logging +from copy import deepcopy from mlblocks.discovery import load_primitive @@ -192,7 +193,7 @@ def get_tunable_hyperparameters(self): tuned, their types and, if applicable, the accepted ranges or values. """ - return self._tunable.copy() + return deepcopy(self._tunable) def get_hyperparameters(self): """Get hyperparameters values that the current MLBlock is using. @@ -202,7 +203,7 @@ def get_hyperparameters(self): the dictionary containing the hyperparameter values that the MLBlock is currently using. """ - return self._hyperparameters.copy() + return deepcopy(self._hyperparameters) def set_hyperparameters(self, hyperparameters): """Set new hyperparameters. @@ -221,7 +222,7 @@ def set_hyperparameters(self, hyperparameters): if self._class: LOGGER.debug('Creating a new primitive instance for %s', self.name) - self.instance = self.primitive(**self._hyperparameters) + self.instance = self.primitive(**self.get_hyperparameters()) def _get_method_kwargs(self, kwargs, method_args): """Prepare the kwargs for the method. @@ -307,5 +308,5 @@ def produce(self, **kwargs): if self._class: return getattr(self.instance, self.produce_method)(**produce_kwargs) - produce_kwargs.update(self._hyperparameters) + produce_kwargs.update(self.get_hyperparameters()) return self.primitive(**produce_kwargs) diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 5273d40c..16f1c6d1 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from unittest import TestCase -from unittest.mock import patch +from unittest.mock import MagicMock, Mock, patch from mlblocks.mlblock import MLBlock, import_object @@ -403,27 +403,53 @@ def test_get_tunable_hyperparameters(self, load_primitive_mock, import_object_mo assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlblock.import_object', new=Mock()) + @patch('mlblocks.mlblock.load_primitive', new=MagicMock()) + def test_get_hyperparameters(self): + """get_hyperparameters has to return a deepcopy of the _hyperparameters attribute.""" + mlblock = MLBlock('given_primitive_name') + + hyperparameters = { + 'a_list_param': ['a'] + } + mlblock._hyperparameters = hyperparameters + + returned = mlblock.get_hyperparameters() + + assert returned == hyperparameters + assert returned is not hyperparameters + + returned['a_list_param'].append('b') + assert 'b' not in hyperparameters['a_list_param'] + @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') - def test_get_hyperparameters(self, load_primitive_mock, import_object_mock): - """get_hyperparameters has to return a copy of the _hyperparameters attribute.""" - load_primitive_mock.return_value = { - 'primitive': 'a_primitive_name', + def test_modify_hyperparameters(self, lp_mock, io_mock): + """If a primitive method modifies the hyperparameters, changes should not persist.""" + + def primitive(a_list_param): + a_list_param.append('b') + + io_mock.return_value = primitive + + 
lp_mock.return_value = { + 'primitive': 'a_primitive', 'produce': { 'args': [], 'output': [] } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive') - hyperparameters = dict() + hyperparameters = { + 'a_list_param': ['a'] + } mlblock._hyperparameters = hyperparameters - returned = mlblock.get_hyperparameters() + mlblock.produce() - assert returned == hyperparameters - assert returned is not hyperparameters + assert 'b' not in hyperparameters['a_list_param'] def test_set_hyperparameters_function(self): pass From 735f48d02f2d73f019d9623fcfcc0920abfb6904 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 17 Jul 2019 15:07:39 +0200 Subject: [PATCH 041/160] Add fit and produce default arg values --- mlblocks/mlpipeline.py | 4 ++++ tests/test_mlpipeline.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 36b71b29..e19a68ee 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -330,6 +330,10 @@ def _get_block_args(self, block_name, block_args, context): if variable in context: kwargs[name] = context[variable] + elif 'default' in arg: + kwargs[name] = arg['default'] + elif arg.get('required', True): + raise ValueError('Input variable {} not found in context'.format(variable)) return kwargs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 906c2c61..2011f5ae 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -238,7 +238,42 @@ def test_set_hyperparameters_flat(self): block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) def test__get_block_args(self): - pass + input_names = { + 'a_block': { + 'arg_3': 'arg_3_alt' + } + } + pipeline = MLPipeline(list(), input_names=input_names) + + block_args = [ + { + 'name': 'arg_1', + }, + { + 'name': 'arg_2', + 'default': 'arg_2_value' + }, + { + 'name': 'arg_3', + }, + { + 'name': 'arg_4', + 'required': False + }, + ] + context = { + 'arg_1': 'arg_1_value', + 'arg_3_alt': 'arg_3_value' + } + + args = pipeline._get_block_args('a_block', block_args, context) + + expected = { + 'arg_1': 'arg_1_value', + 'arg_2': 'arg_2_value', + 'arg_3': 'arg_3_value', + } + assert args == expected def test__get_outputs(self): pass From 2662fea39476dfc30914a9ded59caecdfe51ad0c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 26 Jul 2019 19:03:12 +0200 Subject: [PATCH 042/160] Fix dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 98350606..0d9f766b 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ 'ipython>=6.5.0', 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', + 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check 'flake8>=3.5.0', From ae6ab0983b10598a214fe9af2eb25e18a7442a5e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 26 Jul 2019 23:22:25 +0200 Subject: [PATCH 043/160] Fix testing dependencies --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0d9f766b..608e481d 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,8 @@ 'pytest-cov>=2.6.0', 'mlprimitives>=0.1.3,<0.2', 'urllib3>=1.20,<1.25', - 'setuptools>=41.0.0' + 'setuptools>=41.0.0', + 'numpy<1.17', ] From cd005af297f72cc5b6cb6b29228de14de992b920 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 14:03:10 +0200 Subject: [PATCH 044/160] Flexible filter searching --- mlblocks/discovery.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 
deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 40853de9..6d85f970 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -312,7 +312,23 @@ def _search_annotations(base_path, pattern, parts=None): return annotations -def _get_annotations_list(paths, loader, pattern, **metadata_filters): +def _match_filter(annotation, key, value): + if '.' in key: + name, key = key.split('.', 1) + part = annotation.get(name) or dict() + return _match_filter(part, key, value) + + annotation_value = annotation.get(key) + if not isinstance(annotation_value, type(value)): + if isinstance(annotation_value, (list, dict)): + return value in annotation_value + elif isinstance(value, (list, dict)): + return annotation_value in value + + return annotation_value == value + + +def _get_annotations_list(paths, loader, pattern, filters): pattern = re.compile(pattern) annotations = dict() for base_path in paths: @@ -321,10 +337,8 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters): matching = list() for name in sorted(annotations.values()): annotation = loader(name) - metadata = annotation.get('metadata', dict()) - for key, value in metadata_filters.items(): - metadata_value = metadata.get(key, '') - if not re.search(value, metadata_value): + for key, value in filters.items(): + if not _match_filter(annotation, key, value): break else: @@ -333,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, **metadata_filters): return matching -def get_primitives_list(pattern='', **metadata_filters): - return _get_annotations_list( - get_primitives_paths(), load_primitive, pattern, **metadata_filters) +def get_primitives_list(pattern='', filters=None): + filters = filters or dict() + return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters) -def get_pipelines_list(pattern='', **metadata_filters): - return _get_annotations_list( - get_pipelines_paths(), load_pipeline, pattern, **metadata_filters) +def get_pipelines_list(pattern='', filters=None): + filters = filters or dict() + return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) From 82ef5b53bd5ccd54c8971ae64479bc79d64f35ba Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 21:04:13 +0200 Subject: [PATCH 045/160] Rename find_primitives and add tests --- mlblocks/discovery.py | 10 +- tests/test_discovery.py | 201 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 203 insertions(+), 8 deletions(-) diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py index 6d85f970..db7ba40d 100644 --- a/mlblocks/discovery.py +++ b/mlblocks/discovery.py @@ -312,11 +312,11 @@ def _search_annotations(base_path, pattern, parts=None): return annotations -def _match_filter(annotation, key, value): +def _match(annotation, key, value): if '.' 
in key: name, key = key.split('.', 1) part = annotation.get(name) or dict() - return _match_filter(part, key, value) + return _match(part, key, value) annotation_value = annotation.get(key) if not isinstance(annotation_value, type(value)): @@ -338,7 +338,7 @@ def _get_annotations_list(paths, loader, pattern, filters): for name in sorted(annotations.values()): annotation = loader(name) for key, value in filters.items(): - if not _match_filter(annotation, key, value): + if not _match(annotation, key, value): break else: @@ -347,11 +347,11 @@ def _get_annotations_list(paths, loader, pattern, filters): return matching -def get_primitives_list(pattern='', filters=None): +def find_primitives(pattern='', filters=None): filters = filters or dict() return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters) -def get_pipelines_list(pattern='', filters=None): +def find_pipelines(pattern='', filters=None): filters = filters or dict() return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 3681611b..07fc0753 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -2,9 +2,10 @@ import json import os +import re import tempfile import uuid -from unittest.mock import call, patch +from unittest.mock import Mock, call, patch import pytest from pkg_resources import Distribution, EntryPoint @@ -12,6 +13,10 @@ from mlblocks import discovery FAKE_PRIMITIVES_PATH = 'this/is/a/fake' +FAKE_PRIMITIVES_PATHS = [ + 'this/is/another/fake', + 'this/is/yet/another/fake', +] def test__add_lookup_path_do_nothing(): @@ -81,9 +86,16 @@ def test__load_entry_points_entry_points(iep_mock): attrs=['FAKE_PRIMITIVES_PATH'], dist=Distribution() ) + another_primitives_ep = EntryPoint( + 'primitives', + 'tests.test_discovery', + attrs=['FAKE_PRIMITIVES_PATHS'], + dist=Distribution() + ) iep_mock.return_value = [ something_else_ep, - primitives_ep + primitives_ep, + another_primitives_ep ] # run @@ -91,7 +103,9 @@ def test__load_entry_points_entry_points(iep_mock): # assert expected = [ - 'this/is/a/fake' + 'this/is/a/fake', + 'this/is/another/fake', + 'this/is/yet/another/fake', ] assert paths == expected @@ -196,3 +210,184 @@ def test__load_pipeline_success(load_mock, gpp_mock): load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) assert pipeline == load_mock.return_value + + +@patch('mlblocks.discovery.os') +def test__search_annotations(os_mock): + os_mock.path.abspath = os.path.abspath + os_mock.path.join = os.path.join + os_mock.path.exists.return_value = True + os_mock.listdir.side_effect = [ + [ + 'a.primitive.json', + 'another.primitive.json', + 'some', + ], + [ + 'other', + ], + [ + 'primitive.json' + ] + ] + os_mock.path.isdir.return_value = False + os_mock.path.isdir.side_effect = [ + False, + False, + True, + True, + False + ] + + pattern = re.compile('other') + annotations = discovery._search_annotations('/path/to', pattern) + + assert annotations == { + '/path/to/another.primitive.json': 'another.primitive', + '/path/to/some/other/primitive.json': 'some.other.primitive' + } + + +def test__match_no_match(): + annotation = { + 'name': 'a.primitive', + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_root(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_sublevel(): + annotation = { + 'name': 'a.primitive', + 
'some': { + 'sublevel': { + 'key': 'value' + } + } + } + + matches = discovery._match(annotation, 'some.sublevel.key', 'value') + + assert matches + + +def test__match_list_no_match(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'another_value' + 'yet_another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert not matches + + +def test__match_list(): + annotation = { + 'name': 'a.primitive', + 'key': [ + 'value', + 'another_value' + ] + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_dict(): + annotation = { + 'name': 'a.primitive', + 'key': { + 'value': 'subvalue', + 'another_value': 'another_subvalue' + } + } + + matches = discovery._match(annotation, 'key', 'value') + + assert matches + + +def test__match_multiple_keys(): + annotation = { + 'name': 'a.primitive', + 'key': 'value' + } + + matches = discovery._match(annotation, 'key', ['value', 'another_value']) + + assert matches + + +@patch('mlblocks.discovery._search_annotations') +def test__get_annotations_list(search_annotations_mock): + search_annotations_mock.return_value = { + '/path/to/a/classifier.primitive.json': 'classifier.primitive', + '/path/to/a/regressor.primitive.json': 'regressor.primitive', + } + + loader = Mock() + loader.side_effect = [ + { + 'name': 'classifier.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'classifier', + } + }, + { + 'name': 'regressor.primitive', + 'classifiers': { + 'type': 'estimator', + 'subtype': 'regressor', + } + } + ] + + filters = { + 'classifiers.subtype': 'regressor' + } + annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters) + + assert annotations == ['regressor.primitive'] + search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern')) + + +@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery.get_primitives_paths') +def test_find_primitives(gpp_mock, gal_mock): + primitives = discovery.find_primitives('pattern') + + gal_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) + + assert primitives == gal_mock.return_value + + +@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery.get_pipelines_paths') +def test_find_primitives(gpp_mock, gal_mock): + primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) + + gal_mock.assert_called_once_with( + gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) + + assert primitives == gal_mock.return_value From 1ca63500c1c86fc973005ad2d3c2a768f685f13a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 6 Aug 2019 21:08:44 +0200 Subject: [PATCH 046/160] rename method --- tests/test_discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 07fc0753..bf148571 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -384,7 +384,7 @@ def test_find_primitives(gpp_mock, gal_mock): @patch('mlblocks.discovery._get_annotations_list') @patch('mlblocks.discovery.get_pipelines_paths') -def test_find_primitives(gpp_mock, gal_mock): +def test_find_pipelines(gpp_mock, gal_mock): primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) gal_mock.assert_called_once_with( From ec4609f45929defff7e64a09c81a866810774d4f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:10:34 +0200 Subject: [PATCH 047/160] Add docstrings and rename a few methods --- 
mlblocks/discovery.py | 151 ++++++++++++++++++++++++++++++++++++----
 tests/test_discovery.py |  26 ++++---
 2 files changed, 149 insertions(+), 28 deletions(-)

diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index db7ba40d..9a1dbef5 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -297,6 +297,29 @@ def load_pipeline(name):
 
 
 def _search_annotations(base_path, pattern, parts=None):
+    """Search for annotations within the given path.
+
+    If the indicated path has subfolders, search recursively within them.
+
+    If a pattern is given, return only the annotations whose name
+    matches the pattern.
+
+    Args:
+        base_path (str):
+            path to the folder to be searched for annotations.
+        pattern (str):
+            Regular expression to search in the annotation names.
+        parts (list):
+            Optional. List containing the parent folders that are also part
+            of the annotation name. Used during recursion to be able to
+            build the final annotation name before returning it.
+
+    Returns:
+        dict:
+            dictionary containing paths as keys and annotation names as
+            values.
+    """
+    pattern = re.compile(pattern)
     annotations = dict()
     parts = parts or list()
     if os.path.exists(base_path):
@@ -312,24 +335,70 @@ def _search_annotations(base_path, pattern, parts=None):
     return annotations
 
 
-def _match(annotation, key, value):
-    if '.' in key:
-        name, key = key.split('.', 1)
-        part = annotation.get(name) or dict()
-        return _match(part, key, value)
+def _match(annotation, key, values):
+    """Check if the annotation has the key and it matches any of the values.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        annotation (dict):
+            Dictionary annotation.
+        key (str):
+            Key to search within the annotation. It can contain dots to
+            separate nested subdictionary levels within the annotation.
+        values (object or list):
+            Value or list of values to search for.
 
-    annotation_value = annotation.get(key)
-    if not isinstance(annotation_value, type(value)):
+    Returns:
+        bool:
+            whether there is a match or not.
+    """
+    if not isinstance(values, list):
+        values = [values]
+
+    if key not in annotation:
+        if '.' in key:
+            name, key = key.split('.', 1)
+            part = annotation.get(name) or dict()
+            return _match(part, key, values)
+        else:
+            return False
+
+    annotation_value = annotation[key]
+
+    for value in values:
         if isinstance(annotation_value, (list, dict)):
             return value in annotation_value
-        elif isinstance(value, (list, dict)):
-            return annotation_value in value
+        elif annotation_value == value:
+            return True
 
-    return annotation_value == value
+    return False
 
 
-def _get_annotations_list(paths, loader, pattern, filters):
-    pattern = re.compile(pattern)
+def _find_annotations(paths, loader, pattern, filters):
+    """Find matching annotations within the given paths.
+
+    Match annotations by both name pattern and filters.
+
+    Args:
+        paths (list):
+            List of paths to search annotations in.
+        loader (callable):
+            Function to use to load the annotation contents.
+        pattern (str):
+            Pattern to match against the annotation name.
+        filters (dict):
+            Dictionary containing key/value filters.
+
+    Returns:
+        list:
+            names of the matching annotations.
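+
+    A minimal sketch of a call (the path, pattern and filter shown here are
+    illustrative, taken from the tests below)::
+
+        _find_annotations(['/a/path'], load_primitive, 'classifier',
+                          {'classifiers.type': 'estimator'})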
+    """
     annotations = dict()
     for base_path in paths:
         annotations.update(_search_annotations(base_path, pattern))
 
     matching = list()
     for name in sorted(annotations.values()):
         annotation = loader(name)
         for key, value in filters.items():
             if not _match(annotation, key, value):
                 break
 
         else:
             matching.append(name)
 
     return matching
 
 
 def find_primitives(pattern='', filters=None):
+    """Find primitives by name and filters.
+
+    If a pattern is given, only the primitives whose name matches
+    the pattern will be returned.
+
+    If filters are given, they should be a dictionary containing key/value
+    filters that will have to be matched within the primitive annotation
+    for it to be included in the results.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        pattern (str):
+            Regular expression to match against the primitive names.
+        filters (dict):
+            Dictionary containing the filters to apply over the matching
+            primitives.
+
+    Returns:
+        list:
+            Names of the matching primitives.
+    """
     filters = filters or dict()
-    return _get_annotations_list(get_primitives_paths(), load_primitive, pattern, filters)
+    return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters)
 
 
 def find_pipelines(pattern='', filters=None):
+    """Find pipelines by name and filters.
+
+    If a pattern is given, only the pipelines whose name matches
+    the pattern will be returned.
+
+    If filters are given, they should be a dictionary containing key/value
+    filters that will have to be matched within the pipeline annotation
+    for it to be included in the results.
+
+    If the given key is not found but it contains dots, split by the dots
+    and consider each part a sublevel in the annotation.
+
+    If the key value within the annotation is a list or a dict, check
+    whether any of the given values is contained within it instead of
+    checking for equality.
+
+    Args:
+        pattern (str):
+            Regular expression to match against the pipeline names.
+        filters (dict):
+            Dictionary containing the filters to apply over the matching
+            pipelines.
+
+    Returns:
+        list:
+            Names of the matching pipelines.
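+
+    A quick usage sketch (the pattern and filter values shown here are
+    illustrative)::
+
+        find_pipelines('classifier', filters={'classifiers.type': 'estimator'})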
+ """ filters = filters or dict() - return _get_annotations_list(get_pipelines_paths(), load_pipeline, pattern, filters) + return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index bf148571..dc3eca87 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -2,7 +2,6 @@ import json import os -import re import tempfile import uuid from unittest.mock import Mock, call, patch @@ -239,8 +238,7 @@ def test__search_annotations(os_mock): False ] - pattern = re.compile('other') - annotations = discovery._search_annotations('/path/to', pattern) + annotations = discovery._search_annotations('/path/to', 'other') assert annotations == { '/path/to/another.primitive.json': 'another.primitive', @@ -338,7 +336,7 @@ def test__match_multiple_keys(): @patch('mlblocks.discovery._search_annotations') -def test__get_annotations_list(search_annotations_mock): +def test__find_annotations(search_annotations_mock): search_annotations_mock.return_value = { '/path/to/a/classifier.primitive.json': 'classifier.primitive', '/path/to/a/regressor.primitive.json': 'regressor.primitive', @@ -365,29 +363,29 @@ def test__get_annotations_list(search_annotations_mock): filters = { 'classifiers.subtype': 'regressor' } - annotations = discovery._get_annotations_list(['/a/path'], loader, 'pattern', filters) + annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters) assert annotations == ['regressor.primitive'] - search_annotations_mock.assert_called_once_with('/a/path', re.compile('pattern')) + search_annotations_mock.assert_called_once_with('/a/path', 'pattern') -@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery._find_annotations') @patch('mlblocks.discovery.get_primitives_paths') -def test_find_primitives(gpp_mock, gal_mock): +def test_find_primitives(gpp_mock, fa_mock): primitives = discovery.find_primitives('pattern') - gal_mock.assert_called_once_with( + fa_mock.assert_called_once_with( gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) - assert primitives == gal_mock.return_value + assert primitives == fa_mock.return_value -@patch('mlblocks.discovery._get_annotations_list') +@patch('mlblocks.discovery._find_annotations') @patch('mlblocks.discovery.get_pipelines_paths') -def test_find_pipelines(gpp_mock, gal_mock): +def test_find_pipelines(gpp_mock, fa_mock): primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) - gal_mock.assert_called_once_with( + fa_mock.assert_called_once_with( gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) - assert primitives == gal_mock.return_value + assert primitives == fa_mock.return_value From 69a30cafcae4a776e8d1ed09c41116d1c82d2bee Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:36:51 +0200 Subject: [PATCH 048/160] Update README --- README.md | 91 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 5b4f2519..2d49d8a6 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,14 @@ Pipelines and Primitives for Machine Learning and Data Science.

-[![PyPi][pypi-img]][pypi-url] -[![Travis][travis-img]][travis-url] -[![CodeCov][codecov-img]][codecov-url] - -[pypi-img]: https://img.shields.io/pypi/v/mlblocks.svg -[pypi-url]: https://pypi.python.org/pypi/mlblocks -[travis-img]: https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master -[travis-url]: https://travis-ci.org/HDI-Project/MLBlocks -[codecov-img]: https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg -[codecov-url]: https://codecov.io/gh/HDI-Project/MLBlocks +[![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) +[![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks) +[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) +[![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks +- Homepage: https://github.com/HDI-Project/MLBlocks # Overview @@ -44,24 +39,82 @@ Features include: outputs per primitive. * Easy save and load Pipelines using JSON Annotations. -# Installation +# Install + +## Requirements + +**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/) + +Also, although it is not strictly required, the usage of a +[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid +interfering with other software installed in the system where **MLBlocks** is run. + +These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**: + +```bash +pip install virtualenv +virtualenv -p $(which python3.6) mlblocks-venv +``` -The simplest and recommended way to install MLBlocks is using `pip`: +Afterwards, you have to execute this command to have the virtualenv activated: + +```bash +source mlblocks-venv/bin/activate +``` + +Remember about executing it every time you start a new console to work on **MLBlocks**! + +## Install with pip + +After creating the virtualenv and activating it, we recommend using +[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**: ```bash pip install mlblocks ``` -Alternatively, you can also clone the repository and install it from sources +This will pull and install the latest stable release from [PyPi](https://pypi.org/). + +## Install from source + +Alternatively, with your virtualenv activated, you can clone the repository and install it from +source by running `make install` on the `stable` branch: ```bash git clone git@github.com:HDI-Project/MLBlocks.git cd MLBlocks +git checkout stable make install ``` -For development, you can use `make install-develop` instead in order to install all -the required dependencies for testing and code linting. +## Install for Development + +If you want to contribute to the project, a few more steps are required to make the project ready +for development. + +First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks) +and make a fork of the project under you own username by clicking on the **fork** button on the +upper right corner of the page. 
+ +Afterwards, clone your fork and create a branch from master with a descriptive name that includes +the number of the issue that you are going to work on: + +```bash +git clone git@github.com:{your username}/MLBlocks.git +cd MLBlocks +git branch issue-xx-cool-new-feature master +git checkout issue-xx-cool-new-feature +``` + +Finally, install the project with the following command, which will install some additional +dependencies for code linting and testing. + +```bash +make install-develop +``` + +Make sure to use them regularly while developing by running the commands `make lint` and `make test`. + ## MLPrimitives @@ -75,12 +128,12 @@ with this command: pip install mlprimitives ``` -# Usage Example +# Quickstart Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it using demo data and use it to make predictions. -Please make sure to having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) +Please make sure to also having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) before following it. For advance usage and more detailed explanation about each component, please have a look @@ -153,7 +206,7 @@ its `get_hyperparameters` method: } ``` -### Making predictions +## Making predictions Once we have created the pipeline with the desired hyperparameters we can fit it and then use it to make predictions on new data. @@ -180,7 +233,7 @@ to obtain predictions from the pipeline. array([3, 2, 1, ..., 1, 1, 2]) ``` -## What's Next? +# What's Next? If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please From 6324d3ffaad0fc45ad00fabc9c43de0b6e92ebf0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 12:38:25 +0200 Subject: [PATCH 049/160] Update README title --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 2d49d8a6..19f740ed 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,6 @@ An open source project from Data to AI Lab at MIT.

-[two header image lines removed here; HTML markup stripped during extraction]
 “MLBlocks” [logo image line kept; markup stripped during extraction]
 
@@ -22,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science. * Documentation: https://HDI-Project.github.io/MLBlocks - Homepage: https://github.com/HDI-Project/MLBlocks -# Overview +# MLBlocks MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. From c2771588f0d65e7ad3fdde9c71d3979ecd3dca3a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 13:05:45 +0200 Subject: [PATCH 050/160] Update dependencies --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 608e481d..4c371761 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.1.3,<0.2', + 'mlprimitives>=0.2,<0.3', 'urllib3>=1.20,<1.25', 'setuptools>=41.0.0', 'numpy<1.17', From b65d7c77fd0b4275fb287e77867a8a43471ee3b3 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 13:34:23 +0200 Subject: [PATCH 051/160] Fix docs quickstart --- Makefile | 2 +- docs/getting_started/quickstart.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6266033f..e54e1362 100644 --- a/Makefile +++ b/Makefile @@ -112,7 +112,7 @@ test: ## run tests quickly with the default Python .PHONY: test-all test-all: ## run tests on every Python version with tox - tox + tox -r .PHONY: coverage coverage: ## check code coverage quickly with the default Python diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2887da05..31be89ee 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,6 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] From c189a7f267613e71ba8632c1b5ca80bf1be79043 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 7 Aug 2019 14:20:24 +0200 Subject: [PATCH 052/160] Add metadata attribute --- mlblocks/mlblock.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index c32f978a..5727384e 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -146,22 +146,22 @@ def _get_tunable(cls, hyperparameters, init_params): def __init__(self, name, **kwargs): self.name = name - primitive = load_primitive(name) + self.metadata = load_primitive(name) - self.primitive = import_object(primitive['primitive']) + self.primitive = import_object(self.metadata['primitive']) - self._fit = primitive.get('fit', dict()) + self._fit = self.metadata.get('fit', dict()) self.fit_args = self._fit.get('args', []) self.fit_method = self._fit.get('method') - self._produce = primitive['produce'] + self._produce = self.metadata['produce'] self.produce_args = self._produce['args'] self.produce_output = self._produce['output'] self.produce_method = self._produce.get('method') self._class = bool(self.produce_method) - hyperparameters = primitive.get('hyperparameters', dict()) + hyperparameters = self.metadata.get('hyperparameters', dict()) init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters) self._hyperparameters = init_params From c78c1373f03aa82ef73cbfcffa2d48f051eb4cbf Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 8 Aug 2019 15:42:36 +0200 Subject: 
[PATCH 053/160] Allow passing fit and produce args as init params --- mlblocks/mlblock.py | 25 ++++++++------- mlblocks/mlpipeline.py | 37 +++++++++++----------- tests/features/test_fit_predicr_args.py | 42 +++++++++++++++++++++++++ tests/test_mlblock.py | 25 ++++++++++++--- tests/test_mlpipeline.py | 1 - 5 files changed, 96 insertions(+), 34 deletions(-) create mode 100644 tests/features/test_fit_predicr_args.py diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index 5727384e..db24caa5 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -13,8 +13,11 @@ def import_object(object_name): """Import an object from its Fully Qualified Name.""" - package, name = object_name.rsplit('.', 1) - return getattr(importlib.import_module(package), name) + if isinstance(object_name, str): + package, name = object_name.rsplit('.', 1) + return getattr(importlib.import_module(package), name) + + return object_name class MLBlock(): @@ -27,7 +30,7 @@ class MLBlock(): Attributes: name (str): - Name given to this MLBlock. + Primitive name. metadata (dict): Additional information about this primitive primitive (object): @@ -46,8 +49,8 @@ class MLBlock(): function. Args: - name (str): - Name given to this MLBlock. + primitive (str or dict): + primitive name or primitive dictionary. **kwargs: Any additional arguments that will be used as hyperparameters or passed to the ``fit`` or ``produce`` methods. @@ -143,10 +146,12 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable - def __init__(self, name, **kwargs): - self.name = name + def __init__(self, primitive, **kwargs): + if isinstance(primitive, str): + primitive = load_primitive(primitive) - self.metadata = load_primitive(name) + self.metadata = primitive + self.name = primitive['name'] self.primitive = import_object(self.metadata['primitive']) @@ -252,11 +257,9 @@ def _get_method_kwargs(self, kwargs, method_args): if name in kwargs: value = kwargs[name] - elif 'default' in arg: value = arg['default'] - - else: + elif arg.get('required', True): raise TypeError("missing expected argument '{}'".format(name)) method_kwargs[keyword] = value diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index e19a68ee..14e5ce67 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -87,16 +87,21 @@ def _build_blocks(self): block_names_count = Counter() for primitive in self.primitives: + if isinstance(primitive, str): + primitive_name = primitive + else: + primitive_name = primitive['name'] + try: - block_names_count.update([primitive]) - block_count = block_names_count[primitive] - block_name = '{}#{}'.format(primitive, block_count) + block_names_count.update([primitive_name]) + block_count = block_names_count[primitive_name] + block_name = '{}#{}'.format(primitive_name, block_count) block_params = self.init_params.get(block_name, dict()) if not block_params: - block_params = self.init_params.get(primitive, dict()) + block_params = self.init_params.get(primitive_name, dict()) if block_params and block_count > 1: LOGGER.warning(("Non-numbered init_params are being used " - "for more than one block %s."), primitive) + "for more than one block %s."), primitive_name) block = MLBlock(primitive, **block_params) blocks[block_name] = block @@ -330,10 +335,6 @@ def _get_block_args(self, block_name, block_args, context): if variable in context: kwargs[name] = context[variable] - elif 'default' in arg: - kwargs[name] = arg['default'] - elif arg.get('required', True): - raise ValueError('Input variable {} not found in 
context'.format(variable)) return kwargs @@ -517,11 +518,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): the value of that variable from the context will extracted and returned after the produce method of that block has been called. """ - context = { - 'X': X, - 'y': y - } - context.update(kwargs) + context = kwargs.copy() + if X is not None: + context['X'] = X + + if y is not None: + context['y'] = y output_block, output_variable = self._get_output_spec(output_) last_block_name = self._get_block_name(-1) @@ -624,10 +626,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): the value of that variable from the context will extracted and returned after the produce method of that block has been called. """ - context = { - 'X': X - } - context.update(kwargs) + context = kwargs.copy() + if X is not None: + context['X'] = X output_block, output_variable = self._get_output_spec(output_) diff --git a/tests/features/test_fit_predicr_args.py b/tests/features/test_fit_predicr_args.py new file mode 100644 index 00000000..af4c0aea --- /dev/null +++ b/tests/features/test_fit_predicr_args.py @@ -0,0 +1,42 @@ +from mlblocks.mlpipeline import MLPipeline + + +def test_fit_predict_args_in_init(): + + def add(a, b): + return a + b + + primitive = { + 'name': 'add', + 'primitive': add, + 'produce': { + 'args': [ + { + 'name': 'a', + 'type': 'float', + }, + { + 'name': 'b', + 'type': 'float', + }, + ], + 'output': [ + { + 'type': 'float', + 'name': 'out' + } + ] + } + } + + primitives = [primitive] + init_params = { + 'add': { + 'b': 10 + } + } + pipeline = MLPipeline(primitives, init_params=init_params) + + out = pipeline.predict(a=3) + + assert out == 13 diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py index 16f1c6d1..b4dbc637 100644 --- a/tests/test_mlblock.py +++ b/tests/test_mlblock.py @@ -323,6 +323,7 @@ def test__get_tunable_condition_match_null(self): @patch('mlblocks.mlblock.load_primitive') def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [ @@ -335,9 +336,22 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): } } - mlblock = MLBlock('given_primitive_name', argument='value') + mlblock = MLBlock('a_primitive_name', argument='value') - assert mlblock.name == 'given_primitive_name' + assert mlblock.metadata == { + 'name': 'a_primitive_name', + 'primitive': 'a_primitive_name', + 'produce': { + 'args': [ + { + 'name': 'argument' + } + ], + 'output': [ + ] + } + } + assert mlblock.name == 'a_primitive_name' assert mlblock.primitive == import_object_mock.return_value assert mlblock._fit == dict() assert mlblock.fit_args == list() @@ -370,6 +384,7 @@ def test___init__(self, load_primitive_mock, import_object_mock, set_hps_mock): @patch('mlblocks.mlblock.load_primitive') def test___str__(self, load_primitive_mock, import_object_mock): load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -377,15 +392,16 @@ def test___str__(self, load_primitive_mock, import_object_mock): } } - mlblock = MLBlock('given_primitive_name') + mlblock = MLBlock('a_primitive_name') - assert str(mlblock) == 'MLBlock - given_primitive_name' + assert str(mlblock) == 'MLBlock - a_primitive_name' @patch('mlblocks.mlblock.import_object') @patch('mlblocks.mlblock.load_primitive') def test_get_tunable_hyperparameters(self, 
load_primitive_mock, import_object_mock): """get_tunable_hyperparameters has to return a copy of the _tunables attribute.""" load_primitive_mock.return_value = { + 'name': 'a_primitive_name', 'primitive': 'a_primitive_name', 'produce': { 'args': [], @@ -433,6 +449,7 @@ def primitive(a_list_param): io_mock.return_value = primitive lp_mock.return_value = { + 'name': 'a_primitive', 'primitive': 'a_primitive', 'produce': { 'args': [], diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 2011f5ae..327387f5 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -270,7 +270,6 @@ def test__get_block_args(self): expected = { 'arg_1': 'arg_1_value', - 'arg_2': 'arg_2_value', 'arg_3': 'arg_3_value', } assert args == expected From badd7f176e4d5df9b89d0e083224a0c33257c807 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:10:31 +0200 Subject: [PATCH 054/160] Add release notest for v0.3.2 --- HISTORY.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e6b14674..f2654353 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,19 +1,23 @@ Changelog ========= +0.3.2 - 2019-08-12 +------------------ + +* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala +* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala +* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala +* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala +* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala + 0.3.1 - Pipelines Discovery --------------------------- -* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala -* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala -* Implement partial re-fit - [Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala -* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala -* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala +* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala +* Implement partial re-fit -[Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala 0.3.0 - New Primitives Discovery -------------------------------- From a094e9f1f7543758a058c8dbf3cb443854cfcf4d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:11:23 +0200 Subject: [PATCH 055/160] =?UTF-8?q?Bump=20version:=200.3.2-dev=20=E2=86=92?= =?UTF-8?q?=200.3.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files 
changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b528aefe..9df5b210 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.2-dev'
+__version__ = '0.3.2'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 1967b27b..97bb08a0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2-dev
+current_version = 0.3.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4c371761..3514f943 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.2-dev',
+    version='0.3.2',
     zip_safe=False,
 )

From 14446f71c60213de2c3206e4beae25c5fa0f5d0e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 12 Aug 2019 13:11:39 +0200
Subject: [PATCH 056/160] Bump version: 0.3.2 → 0.3.3-dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 9df5b210..7f6e1eaf 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.2'
+__version__ = '0.3.3-dev'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 97bb08a0..a9051663 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2
+current_version = 0.3.3-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
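The ``parse`` expression above is what lets bumpversion take a version such as ``0.3.2-dev`` apart into named components before reassembling the next one. A minimal sketch of how the pattern behaves, using only the standard ``re`` module (the pattern is copied from the ``setup.cfg`` hunk above; everything else is illustrative):

    import re

    # Same pattern as in setup.cfg; the release suffix is an optional group.
    PARSE = re.compile(
        r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?'
    )

    assert PARSE.match('0.3.2-dev').groupdict() == {
        'major': '0', 'minor': '3', 'patch': '2', 'release': 'dev'
    }
    # A final release simply leaves the optional group empty:
    assert PARSE.match('0.3.2').groupdict()['release'] is None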
diff --git a/setup.py b/setup.py index 3514f943..870d1276 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.2', + version='0.3.3-dev', zip_safe=False, ) From 65610157d2cea9d42545587b36ef4628d10c2893 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 12 Aug 2019 13:13:19 +0200 Subject: [PATCH 057/160] Typo in the release notes --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index f2654353..c3b00ce0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,7 +8,7 @@ Changelog * Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala * Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala * Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala -* Add primitive caching New Feature - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala +* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala 0.3.1 - Pipelines Discovery --------------------------- From 8c03242cb648a68f997e3ee0b3b6557623bd3b35 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 3 Sep 2019 14:34:26 +0200 Subject: [PATCH 058/160] Advanced intermediate outputs --- mlblocks/mlpipeline.py | 430 ++++++++++++++----------- tests/features/test_partial_outputs.py | 15 +- tests/test_mlpipeline.py | 43 ++- 3 files changed, 280 insertions(+), 208 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 14e5ce67..b02561fe 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -5,6 +5,7 @@ import json import logging from collections import Counter, OrderedDict, defaultdict +from copy import deepcopy import numpy as np @@ -72,6 +73,11 @@ class MLPipeline(): given when stored in the context dictionary. This allows storing the output of different primitives in different variables, even if the primitive output name is the same one. + outputs (dict): + dictionary containing lists of output variables associated to a name. + verbose (bool): + whether to log the exceptions that occur when running the pipeline before + raising them or not. """ def _get_tunable_hyperparameters(self): @@ -114,7 +120,6 @@ def _build_blocks(self): @staticmethod def _get_pipeline_dict(pipeline, primitives): - if isinstance(pipeline, dict): return pipeline @@ -136,18 +141,50 @@ def _get_pipeline_dict(pipeline, primitives): return dict() + def _get_block_outputs(self, block_name): + """Get the list of output variables for the given block.""" + block = self.blocks[block_name] + outputs = deepcopy(block.produce_output) + for output in outputs: + output['variable'] = '{}.{}'.format(block_name, output['name']) + + return outputs + + def _get_outputs(self, pipeline, outputs): + """Get the output definitions from the pipeline dictionary. + + If the ``"default"`` entry does not exist, it is built using the + outputs from the last block in the pipeline. 
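+
+        For example (illustrative, with an invented block name), a
+        single-block pipeline whose block produces one variable named
+        ``y`` would end up with::
+
+            {'default': [{'name': 'y', 'variable': 'a_block#1.y'}]}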
+ """ + outputs = outputs or pipeline.get('outputs') + if outputs is None: + outputs = dict() + + if 'default' not in outputs: + outputs['default'] = self._get_block_outputs(self._last_block_name) + + return outputs + + def _get_block_name(self, index): + """Get the name of the block in the ``index`` position.""" + return list(self.blocks.keys())[index] + def __init__(self, pipeline=None, primitives=None, init_params=None, - input_names=None, output_names=None): + input_names=None, output_names=None, outputs=None, verbose=True): pipeline = self._get_pipeline_dict(pipeline, primitives) self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) self.blocks = self._build_blocks() + self._last_block_name = self._get_block_name(-1) self.input_names = input_names or pipeline.get('input_names', dict()) self.output_names = output_names or pipeline.get('output_names', dict()) + self.outputs = self._get_outputs(pipeline, outputs) + self.verbose = verbose + tunable = pipeline.get('tunable_hyperparameters') if tunable is not None: self._tunable_hyperparameters = tunable @@ -158,6 +195,122 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, if hyperparameters: self.set_hyperparameters(hyperparameters) + def _get_str_output(self, output): + """Get the outputs that correspond to the str specification.""" + if output in self.outputs: + return self.outputs[output] + elif output in self.blocks: + return self._get_block_outputs(output) + elif '.' in output: + block_name, variable_name = output.rsplit('.', 1) + block = self.blocks.get(block_name) + if not block: + raise ValueError('Invalid block name: {}'.format(block_name)) + + for variable in block.produce_output: + if variable['name'] == variable_name: + return [{'name': variable_name, 'variable': output}] + + raise ValueError('Block {} has no output {}'.format(block_name, variable_name)) + + raise ValueError('Invalid Output Specification: {}'.format(output)) + + def get_outputs(self, outputs='default'): + """Get the list of output variables that correspond to the specified outputs. + + Outputs specification can either be a single string, a single integer, or a + list of strings and integers. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or the name of a block, including the + counter number at the end, or a full variable specification following the format + ``{block-name}.{variable-name}``. + + Alternatively, integers can be passed as indexes of the blocks from which to get + the outputs. + + If output specifications that resolve to multiple output variables are given, + such as the named outputs or block names, all the variables are concatenated + together, in order, in a single variable list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of dictionaries specifying all the output variables. Each + dictionary contains the entries ``name`` and ``variable``, as + well as any other metadata that may have been included in the + pipeline outputs or block produce outputs specification. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. 
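+
+        Example:
+            As an illustration (block names invented), ``'default'`` might
+            resolve to ``[{'name': 'y', 'variable': 'a_block#1.y'}]``,
+            while the integer ``-1`` resolves the same last block by
+            position and the string ``'a_block#1.y'`` selects that single
+            variable directly.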
+ """ + if not isinstance(outputs, (list, tuple)): + outputs = (outputs, ) + + computed = list() + for output in outputs: + if isinstance(output, str): + computed.extend(self._get_str_output(output)) + elif isinstance(output, int): + block_name = self._get_block_name(output) + computed.extend(self._get_block_outputs(block_name)) + else: + raise TypeError('Output Specification can only be str or int') + + return computed + + def get_output_names(self, outputs='default'): + """Get the names of the outputs that correspond to the given specification. + + The indicated outputs will be resolved and the names of the output variables + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of variable names + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['name'] for output in outputs] + + def get_output_variables(self, outputs='default'): + """Get the list of variable specifications of the given outputs. + + The indicated outputs will be resolved and their variables specifications + will be returned as a single list. + + Args: + outputs (str, int or list[str or int]): + Single or list of output specifications. + + Returns: + list: + List of variable specifications. + + Raises: + ValueError: + If an output specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + outputs = self.get_outputs(outputs) + return [output['variable'] for output in outputs] + @staticmethod def _flatten_dict(hyperparameters): return { @@ -361,96 +514,48 @@ def _extract_outputs(self, block_name, outputs, block_outputs): return output_dict - def _get_block_name(self, index): - """Get the name of the block in the ``index`` position.""" - return list(self.blocks.keys())[index] - - def _get_output_spec(self, output): - """Parse the output specification and get a block name and a variable name. - - The output specification can be of two types: int and str. - - If it is an integer, it is interpreted as a block index, and the variable name - is considered to be ``None``, which means that the whole context will be returned. - - If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. - * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. 
+ def _update_outputs(self, block_name, output_variables, outputs, outputs_dict): + """Set the requested block outputs into the outputs list in the right place.""" + for key, value in outputs_dict.items(): + variable_name = '{}.{}'.format(block_name, key) + if variable_name in output_variables: + index = output_variables.index(variable_name) + outputs[index] = deepcopy(value) + + def _fit_block(self, block, block_name, context): + """Get the block args from the context and fit the block.""" + LOGGER.debug("Fitting block %s", block_name) + try: + fit_args = self._get_block_args(block_name, block.fit_args, context) + block.fit(**fit_args) + except Exception: + if self.verbose: + LOGGER.exception("Exception caught fitting MLBlock %s", block_name) - Args: - output (str or int): - Output specification as either a string or an integer. + raise - Raises: - ValueError: - If the output string contains dots but it does not match a block - name exactly + def _produce_block(self, block, block_name, context, output_variables, outputs): + """Get the block args from the context and produce the block. - Returns: - tuple: - The output is a tuple containing: - * block_name (str): name of the block from which the output will be - returned, including its counter number. - * variable_name (str): Name of the variable to extract from the context. - It can be ``None``, which means that the whole context is to be - returned. + Afterwards, set the block outputs back into the context and update + the outputs list if necessary. """ - # If None is given, both block and varialbe are None - if output is None: - return None, None - - # If an int is given, it is a block index and there is no variable - if isinstance(output, int): - output = self._get_block_name(output) - return output, None - - # If the string matches a block name, there is no variable - if output in self.blocks: - return output, None - - # If there is at least one dot in the output, but it did not match - # a block name, it is considered to be {block_name}.{variable_name} - if '.' in output: - output_block, output_variable = output.rsplit('.', 1) - if output_block not in self.blocks: - raise ValueError('Unknown block name: {}'.format(output_block)) - - return output_block, output_variable - - # If the given string is not a block name and it has no dots, - # it is considered to be a variable name to be extracted - # from the context after the last block has been produced - last_block_name = self._get_block_name(-1) - return last_block_name, output - - def _get_output(self, output_variable, context): - """Get the specified output variable from the context. - - If the variable name is ``None``, return the entire context. - """ - if output_variable: - if output_variable not in context: - raise ValueError('Output variable {} not found in context' - .format(output_variable)) + LOGGER.debug("Producing block %s", block_name) + try: + produce_args = self._get_block_args(block_name, block.produce_args, context) + block_outputs = block.produce(**produce_args) - return context[output_variable] - else: - return context + outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) + context.update(outputs_dict) + + if output_variables: + self._update_outputs(block_name, output_variables, outputs, outputs_dict) + + except Exception: + if self.verbose: + LOGGER.exception("Exception caught producing MLBlock %s", block_name) + + raise def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. 
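A quick aside on the ordering logic that ``_update_outputs`` implements above: requested outputs keep the caller's order, no matter when each block produces its variable. The idea in isolation (block and variable names are made up):

    # Slots are reserved in request order; values drop into their slot
    # whenever the owning block produces them.
    output_variables = ['second_block#1.y', 'first_block#1.X']
    outputs = output_variables.copy()

    def update_outputs(variable_name, value):
        # Mirrors MLPipeline._update_outputs at this point in the series.
        if variable_name in output_variables:
            outputs[output_variables.index(variable_name)] = value

    update_outputs('first_block#1.X', 'X value')   # produced first
    update_outputs('second_block#1.y', 'y value')  # produced later
    assert outputs == ['y value', 'X value']       # request order kept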
@@ -467,35 +572,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): Args: X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to behave. - output_ (str or int or None): - Output specification, which can be a string or an integer or None. - - * If it is None (default), nothing will be returned - * If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - * If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. - * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If ``None`` is given, + nothing will be returned. start_ (str or int or None): Block index or block name to start processing from. The @@ -510,13 +593,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): Returns: None or dict or object: - * If no output is specified, nothing will be returned. - * If an output block has been specified without and output variable, the - context dictionary will be returned after the produce method of that block - has been called. - * If both an output block and an output variable have been specified, - the value of that variable from the context will extracted and returned - after the produce method of that block has been called. + * If no ``output`` is specified, nothing will be returned. + * If ``output_`` has been specified, either a single value or a + tuple of values will be returned. 
""" context = kwargs.copy() if X is not None: @@ -525,8 +604,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - output_block, output_variable = self._get_output_spec(output_) - last_block_name = self._get_block_name(-1) + if output_ is not None: + output_variables = self.get_output_variables(output_) + outputs = output_variables.copy() + output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} + else: + output_variables = None + outputs = None + output_blocks = set() if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -539,34 +624,28 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - LOGGER.debug("Fitting block %s", block_name) - try: - fit_args = self._get_block_args(block_name, block.fit_args, context) - block.fit(**fit_args) - except Exception: - LOGGER.exception("Exception caught fitting MLBlock %s", block_name) - raise + self._fit_block(block, block_name, context) - if (block_name != last_block_name) or (block_name == output_block): - LOGGER.debug("Producing block %s", block_name) - try: - produce_args = self._get_block_args(block_name, block.produce_args, context) - outputs = block.produce(**produce_args) + if (block_name != self._last_block_name) or (block_name in output_blocks): + self._produce_block(block, block_name, context, output_variables, outputs) - output_dict = self._extract_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) - except Exception: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) - raise + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - if block_name == output_block: - return self._get_output(output_variable, context) + # If there was an output_ but there are no pending + # outputs we are done. + if output_ is not None and not output_blocks: + if len(outputs) > 1: + return tuple(outputs) + else: + return outputs[0] if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - def predict(self, X=None, output_=None, start_=None, **kwargs): + def predict(self, X=None, output_='default', start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the ``produce`` method of each block, capturing the @@ -581,29 +660,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): X: Data which the pipeline will use to make predictions. - output_ (str or int or None): - Output specification, which can be a string or an integer or None. - * If it is None (default), the output of the last block will be returned. - * If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - * If it is a string, it can be interpreted in three ways: - - * **block name**: If the string matches a block name exactly, including - its hash and counter number ``#n`` at the end, the whole context will be - returned after that block is produced. - * **variable_name**: If the string does not match any block name and does - not contain any dot characted, ``'.'``, it will be considered a variable - name. In this case, the indicated variable will be extracted from the - context and returned after the last block has been produced. 
- * **block_name + variable_name**: If the complete string does not match a - block name but it contains at least one dot, ``'.'``, it will be split - in two parts on the last dot. If the first part of the string matches a - block name exactly, the second part of the string will be considered a - variable name, assuming the format ``{block_name}.{variable_name}``, and - the indicated variable will be extracted from the context and returned - after the block has been produced. Otherwise, if the extracted - ``block_name`` does not match a block name exactly, a ``ValueError`` - will be raised. + output_ (str or int or list or None): + Output specification, as required by ``get_outputs``. If not specified + the ``default`` output will be returned. start_ (str or int or None): Block index or block name to start processing from. The @@ -617,20 +676,17 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): to the context dictionary and available for the blocks. Returns: - None or dict or object: - * If no output is specified, the output of the last block will be returned. - * If an output block has been specified without and output variable, the - context dictionary will be returned after the produce method of that block - has been called. - * If both an output block and an output variable have been specified, - the value of that variable from the context will extracted and returned - after the produce method of that block has been called. + object or tuple: + * If a single output is requested, it is returned alone. + * If multiple outputs have been requested, a tuple is returned. """ context = kwargs.copy() if X is not None: context['X'] = X - output_block, output_variable = self._get_output_spec(output_) + output_variables = self.get_output_variables(output_) + outputs = output_variables.copy() + output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -643,27 +699,24 @@ def predict(self, X=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s produce", block_name) continue - LOGGER.debug("Producing block %s", block_name) - try: - produce_args = self._get_block_args(block_name, block.produce_args, context) - outputs = block.produce(**produce_args) - output_dict = self._extract_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + self._produce_block(block, block_name, context, output_variables, outputs) - if block_name == output_block: - return self._get_output(output_variable, context) + # We already captured the output from this block + if block_name in output_blocks: + output_blocks.remove(block_name) - except Exception: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) - raise + # If there was an output_ but there are no pending + # outputs we are done. + if not output_blocks: + if len(outputs) > 1: + return tuple(outputs) + else: + return outputs[0] if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - if output_ is None: - return outputs - def to_dict(self): """Return all the details of this MLPipeline in a dict. 
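With the docstring changes above, both ``fit`` and ``predict`` accept any specification that ``get_outputs`` understands: a spec that resolves to a single variable is returned alone, and one that resolves to several comes back as a tuple. A hedged usage sketch (it assumes the MLPrimitives annotations for these two primitives are installed; the toy data mirrors the feature tests further down):

    import numpy as np
    from mlblocks import MLPipeline

    X = np.eye(5)
    y = np.array([0, 0, 0, 0, 1])

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ])
    pipeline.fit(X, y)

    # 'default' resolves to one variable, so it is returned alone...
    y_pred = pipeline.predict(X)
    # ...while a list spec that resolves to two variables returns a tuple.
    default, scaled = pipeline.predict(X, output_=['default', 0])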
@@ -710,7 +763,8 @@ def to_dict(self): 'input_names': self.input_names, 'output_names': self.output_names, 'hyperparameters': self.get_hyperparameters(), - 'tunable_hyperparameters': self._tunable_hyperparameters + 'tunable_hyperparameters': self._tunable_hyperparameters, + 'outputs': self.outputs, } def save(self, path): diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index ce28d457..7098dcd7 100644 --- a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -40,7 +40,7 @@ def test_fit_output(self): invalid_int = 10 str_block = 'sklearn.preprocessing.StandardScaler#1' invalid_block = 'InvalidBlockName' - str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X' invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' # Run @@ -58,16 +58,9 @@ def test_fit_output(self): [0.71269665, -0.645124, 0.39067021, 0.31740553], [0.26726124, -0.10752067, 1.36734573, 1.55176035] ]) - y = np.array([1, 0, 0, 1, 2]) - context = { - 'X': X, - 'y': y - } - almost_equal(context, int_out) - almost_equal(context, str_out) - - almost_equal(y, str_out_variable) - + almost_equal(X, int_out) + almost_equal(X, str_out) + almost_equal(X, str_out_variable) assert no_output is None # Run asserting exceptions diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 327387f5..3f6121ea 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -2,7 +2,7 @@ from collections import OrderedDict from unittest import TestCase -from unittest.mock import Mock, call, patch +from unittest.mock import MagicMock, Mock, call, patch from mlblocks.mlpipeline import MLPipeline @@ -12,7 +12,15 @@ class TestMLPipline(TestCase): @patch('mlblocks.mlpipeline.LOGGER') @patch('mlblocks.mlpipeline.MLBlock') def test___init__(self, mlblock_mock, logger_mock): - blocks = [Mock(), Mock(), Mock(), Mock()] + blocks = [Mock(), Mock(), Mock()] + last_block = Mock() + last_block.produce_output = [ + { + 'name': 'y', + 'type': 'array' + } + ] + blocks.append(last_block) mlblock_mock.side_effect = blocks primitives = [ @@ -61,6 +69,16 @@ def test___init__(self, mlblock_mock, logger_mock): 'another.primitive.Name#1': blocks[2].get_tunable_hyperparameters.return_value, 'another.primitive.Name#2': blocks[3].get_tunable_hyperparameters.return_value } + assert mlpipeline.outputs == { + 'default': [ + { + 'name': 'y', + 'type': 'array', + 'variable': 'another.primitive.Name#2.y' + } + ] + } + assert mlpipeline.verbose expected_calls = [ call('a.primitive.Name', an_argument='value'), @@ -75,8 +93,9 @@ def test___init__(self, mlblock_mock, logger_mock): 'a.primitive.Name' ) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters(self): - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) tunable = dict() mlpipeline._tunable_hyperparameters = tunable @@ -85,8 +104,9 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters_flat(self): - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) tunable = { 'block_1': { 'hp_1': { @@ -141,6 +161,7 @@ def test_get_tunable_hyperparameters_flat(self): } assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_hyperparameters(self): block_1 = Mock() 
block_1.get_hyperparameters.return_value = { @@ -155,7 +176,7 @@ def test_get_hyperparameters(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = mlpipeline.get_hyperparameters() @@ -172,6 +193,7 @@ def test_get_hyperparameters(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_hyperparameters_flat(self): block_1 = Mock() block_1.get_hyperparameters.return_value = { @@ -186,7 +208,7 @@ def test_get_hyperparameters_flat(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = mlpipeline.get_hyperparameters(flat=True) @@ -199,6 +221,7 @@ def test_get_hyperparameters_flat(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_set_hyperparameters(self): block_1 = Mock() block_2 = Mock() @@ -206,7 +229,7 @@ def test_set_hyperparameters(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = { @@ -219,6 +242,7 @@ def test_set_hyperparameters(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_set_hyperparameters_flat(self): block_1 = Mock() block_2 = Mock() @@ -226,7 +250,7 @@ def test_set_hyperparameters_flat(self): ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), )) - mlpipeline = MLPipeline(list()) + mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks = blocks hyperparameters = { @@ -237,13 +261,14 @@ def test_set_hyperparameters_flat(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test__get_block_args(self): input_names = { 'a_block': { 'arg_3': 'arg_3_alt' } } - pipeline = MLPipeline(list(), input_names=input_names) + pipeline = MLPipeline(['a_primitive'], input_names=input_names) block_args = [ { From dabf1a13dfa05f73e70e9b4578b08bca7ace7edc Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 3 Sep 2019 22:16:10 +0200 Subject: [PATCH 059/160] Add unit tests --- mlblocks/mlpipeline.py | 4 +- tests/features/test_partial_outputs.py | 57 ++++++--- tests/test_mlpipeline.py | 164 ++++++++++++++++++++++++- 3 files changed, 201 insertions(+), 24 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index b02561fe..9de286cb 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -209,7 +209,9 @@ def _get_str_output(self, output): for variable in block.produce_output: if variable['name'] == variable_name: - return [{'name': variable_name, 'variable': output}] + output_variable = deepcopy(variable) + output_variable['variable'] = output + return [output_variable] raise ValueError('Block {} has no output {}'.format(block_name, variable_name)) diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index 7098dcd7..d31d2dd8 100644 --- 
a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -3,7 +3,6 @@ import numpy as np -from mlblocks.datasets import load_iris from mlblocks.mlpipeline import MLPipeline @@ -15,6 +14,7 @@ def almost_equal(obj1, obj2): for key, value in obj1.items(): if key not in obj2: raise AssertionError("{} not in {}".format(key, obj2)) + almost_equal(value, obj2[key]) else: @@ -23,9 +23,14 @@ def almost_equal(obj1, obj2): class TestPartialOutputs(TestCase): def setUp(self): - dataset = load_iris() - - self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1) + self.X = np.array([ + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ]) + self.y = np.array([0, 0, 0, 0, 1]) def test_fit_output(self): @@ -36,6 +41,8 @@ def test_fit_output(self): ] pipeline = MLPipeline(primitives) + named = 'default' + list_ = ['default', 0] int_block = 0 invalid_int = 10 str_block = 'sklearn.preprocessing.StandardScaler#1' @@ -44,20 +51,30 @@ def test_fit_output(self): invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' # Run - int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block) - str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block) - str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5], + named_out = pipeline.fit(self.X, self.y, output_=named) + list_out = pipeline.fit(self.X, self.y, output_=list_) + int_out = pipeline.fit(self.X, self.y, output_=int_block) + str_out = pipeline.fit(self.X, self.y, output_=str_block) + str_out_variable = pipeline.fit(self.X, self.y, output_=str_block_variable) - no_output = pipeline.fit(self.X_train, self.y_train) + no_output = pipeline.fit(self.X, self.y) # Assert successful calls X = np.array([ - [0.71269665, -1.45152899, 0.55344946, 0.31740553], - [0.26726124, 1.23648766, -1.1557327, -1.0932857], - [-1.95991577, 0.967686, -1.1557327, -1.0932857], - [0.71269665, -0.645124, 0.39067021, 0.31740553], - [0.26726124, -0.10752067, 1.36734573, 1.55176035] + [2., -0.5, -0.5, -0.5, -0.5], + [-0.5, 2., -0.5, -0.5, -0.5], + [-0.5, -0.5, 2., -0.5, -0.5], + [-0.5, -0.5, -0.5, 2., -0.5], + [-0.5, -0.5, -0.5, -0.5, 2.], ]) + y = np.array([ + 0, 0, 0, 0, 1 + ]) + + almost_equal(named_out, y) + assert len(list_out) == 2 + almost_equal(list_out[0], y) + almost_equal(list_out[1], X) almost_equal(X, int_out) almost_equal(X, str_out) almost_equal(X, str_out_variable) @@ -65,13 +82,13 @@ def test_fit_output(self): # Run asserting exceptions with self.assertRaises(IndexError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int) + pipeline.fit(self.X, self.y, output_=invalid_int) with self.assertRaises(ValueError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block) + pipeline.fit(self.X, self.y, output_=invalid_block) with self.assertRaises(ValueError): - pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable) + pipeline.fit(self.X, self.y, output_=invalid_variable) def test_fit_start(self): # Setup variables @@ -87,8 +104,8 @@ def test_fit_start(self): # Run first block context = { - 'X': self.X_train, - 'y': self.y_train + 'X': self.X, + 'y': self.y } int_start = 1 str_start = 'sklearn.linear_model.LogisticRegression#1' @@ -106,7 +123,7 @@ def test_predict_start(self): 'sklearn.linear_model.LogisticRegression' ] pipeline = MLPipeline(primitives) - pipeline.fit(self.X_train, self.y_train) + pipeline.fit(self.X, self.y) # Mock the first block block_mock = 
Mock() @@ -114,7 +131,7 @@ def test_predict_start(self): # Run first block context = { - 'X': self.X_train, + 'X': self.X, } int_start = 1 str_start = 'sklearn.linear_model.LogisticRegression#1' diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 3f6121ea..7062e38e 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -107,7 +107,7 @@ def test_get_tunable_hyperparameters(self): @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) def test_get_tunable_hyperparameters_flat(self): mlpipeline = MLPipeline(['a_primitive']) - tunable = { + mlpipeline._tunable_hyperparameters = { 'block_1': { 'hp_1': { 'type': 'int', @@ -133,7 +133,6 @@ def test_get_tunable_hyperparameters_flat(self): } } } - mlpipeline._tunable_hyperparameters = tunable returned = mlpipeline.get_tunable_hyperparameters(flat=True) @@ -299,9 +298,168 @@ def test__get_block_args(self): } assert args == expected - def test__get_outputs(self): + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_no_outputs(self): + self_ = Mock() + self_._last_block_name = 'last_block' + self_._get_block_outputs.return_value = ['some', 'outputs'] + + pipeline = dict() + outputs = None + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'] + } + assert returned == expected + + self_._get_block_outputs.assert_called_once_with('last_block') + + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_defaults(self): + self_ = Mock() + + pipeline = dict() + outputs = { + 'default': ['some', 'outputs'] + } + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'] + } + assert returned == expected + self_._get_block_outputs.assert_not_called() + + @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + def test__get_outputs_additional(self): + self_ = Mock() + + pipeline = { + 'outputs': { + 'default': ['some', 'outputs'], + 'additional': ['other', 'outputs'] + } + } + outputs = None + returned = MLPipeline._get_outputs(self_, pipeline, outputs) + + expected = { + 'default': ['some', 'outputs'], + 'additional': ['other', 'outputs'] + } + assert returned == expected + self_._get_block_outputs.assert_not_called() + + def test_get_outputs_str(self): + pass + + def test_get_outputs_int(self): + pass + + def test_get_outputs_list_of_str(self): + pass + + def test_get_outputs_list_of_int(self): pass + def test_get_outputs_named_outputs(self): + pass + + def test_get_outputs_combination(self): + pass + + @patch('mlblocks.mlpipeline.MLBlock') + def test_get_outputs_invalid(self, mlblock_mock): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type', + } + ], + 'debug': [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + } + mlblock_mock.side_effect = [MagicMock(), MagicMock()] + pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + pipeline.blocks['another_primitive#1'].produce_output = [ + { + 'name': 'something', + } + ] + + returned = pipeline.get_outputs(['default', 'debug', -1, 'a_primitive#1.output']) + + expected = [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type' + }, + { + 'name': 'another_name', + 'variable': 'another_variable', + }, + { + 'name': 'something', + 'variable': 'another_primitive#1.something', + }, + { + 'name': 'output', 
+                'type': 'whatever',
+                'variable': 'a_primitive#1.output'
+            }
+        ]
+
+        assert returned == expected
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+    def test_get_output_names(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_names()
+
+        assert names == ['a_name']
+
+    @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock())
+    def test_get_output_variables(self):
+        outputs = {
+            'default': [
+                {
+                    'name': 'a_name',
+                    'variable': 'a_variable',
+                    'type': 'a_type',
+                }
+            ]
+        }
+        pipeline = MLPipeline(['a_primitive'], outputs=outputs)
+
+        names = pipeline.get_output_variables()
+
+        assert names == ['a_variable']
+
     def test_fit(self):
         pass

From 1d74b20a4ded2d95b46067a6f280b989d16312ef Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:20 +0200
Subject: [PATCH 060/160] Release notes for v0.3.3

---
 HISTORY.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index c3b00ce0..f3dc0a32 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
 Changelog
 =========

+0.3.3 - 2019-09-09
+------------------
+
+* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala
+
 0.3.2 - 2019-08-12
 ------------------

From 3b06ab885dce7fc601468dc6ee8f1b901bfba8ff Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:24 +0200
Subject: [PATCH 061/160] Bump version: 0.3.3-dev → 0.3.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 7f6e1eaf..b85b1de0 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.3-dev'
+__version__ = '0.3.3'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a9051663..0fa10faa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.3-dev
+current_version = 0.3.3
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
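Stepping back to the two accessors pinned down by the tests a few hunks above: ``get_output_names`` and ``get_output_variables`` are the same projection applied to different keys of the resolved output specifications. The projection in isolation (the outputs dict is the hypothetical one from those tests):

    outputs = {
        'default': [
            {'name': 'a_name', 'variable': 'a_variable', 'type': 'a_type'},
        ]
    }

    names = [spec['name'] for spec in outputs['default']]          # ['a_name']
    variables = [spec['variable'] for spec in outputs['default']]  # ['a_variable']
    assert names == ['a_name'] and variables == ['a_variable']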
diff --git a/setup.py b/setup.py
index 870d1276..4104d912 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.3-dev',
+    version='0.3.3',
     zip_safe=False,
 )

From 0dcb324c1d7e09e0a04d61dd400105afa7d6c8a5 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 9 Sep 2019 11:29:43 +0200
Subject: [PATCH 062/160] Bump version: 0.3.3 → 0.3.4-dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index b85b1de0..8c30609e 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.3'
+__version__ = '0.3.4-dev'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index 0fa10faa..de7507c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.3
+current_version = 0.3.4-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
diff --git a/setup.py b/setup.py
index 4104d912..421dbbd6 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.3',
+    version='0.3.4-dev',
     zip_safe=False,
 )

From eb78b55e5466b918ed1be8f0a699538b69d26773 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:13:45 +0200
Subject: [PATCH 063/160] support importing class methods

---
 mlblocks/mlblock.py   | 12 ++++++++++--
 tests/test_mlblock.py | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index db24caa5..f570165b 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -13,9 +13,17 @@

 def import_object(object_name):
     """Import an object from its Fully Qualified Name."""
+
     if isinstance(object_name, str):
-        package, name = object_name.rsplit('.', 1)
-        return getattr(importlib.import_module(package), name)
+        parent_name, attribute = object_name.rsplit('.', 1)
+        try:
+            parent = importlib.import_module(parent_name)
+        except ImportError:
+            grand_parent_name, parent_name = parent_name.rsplit('.', 1)
+            grand_parent = importlib.import_module(grand_parent_name)
+            parent = getattr(grand_parent, parent_name)
+
+        return getattr(parent, attribute)

     return object_name

diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index b4dbc637..355015d0 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -3,19 +3,44 @@
 from unittest import TestCase
 from unittest.mock import MagicMock, Mock, patch

-from mlblocks.mlblock import MLBlock, import_object
+import pytest

-# import pytest
+from mlblocks.mlblock import MLBlock, import_object


 class DummyClass:
+    def a_method(self):
+        pass
+
+
+def dummy_function():
     pass


-def test_import_object():
-    dummy_class = import_object(__name__ + '.DummyClass')
+class TestImportObject(TestCase):
+
+    def test_class(self):
+        imported = import_object(__name__ + '.DummyClass')
+
+        assert imported is DummyClass
+
+    def test_class_method(self):
+        imported = import_object(__name__ + '.DummyClass.a_method')
+
+        assert imported is DummyClass.a_method
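+
+    # How the new fallback resolves this case: 'DummyClass.a_method' is
+    # not importable as a module path, so the except ImportError branch
+    # above imports the module instead, then takes 'DummyClass' and
+    # finally 'a_method' off it with getattr, one attribute per step.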
+ + def test_function(self): + imported = import_object(__name__ + '.dummy_function') + + assert imported is dummy_function + + def test_bad_object_name(self): + with pytest.raises(AttributeError): + import_object(__name__ + '.InvalidName') - assert dummy_class is DummyClass + def test_bad_module(self): + with pytest.raises(ModuleNotFoundError): + import_object('an.invalid.module') class TestMLBlock(TestCase): From 8d53d2a94fcc4323dcc0faf37bc38535b8198fa7 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Sep 2019 16:17:25 +0200 Subject: [PATCH 064/160] Add configuration to upload release candidates to PyPI --- Makefile | 26 +++++++++++++++++++------- mlblocks/__init__.py | 2 +- setup.cfg | 15 +++++++++++---- setup.py | 2 +- tox.ini | 2 +- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index e54e1362..6e8dd203 100644 --- a/Makefile +++ b/Makefile @@ -155,7 +155,7 @@ publish: dist ## package and upload a release .PHONY: bumpversion-release bumpversion-release: ## Merge master to stable and bumpversion release - git checkout stable + git checkout stable || git checkout -b stable git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" bumpversion release git push --tags origin stable @@ -167,6 +167,10 @@ bumpversion-patch: ## Merge stable to master and bumpversion patch bumpversion --no-tag patch git push +.PHONY: bumpversion-candidate +bumpversion-candidate: ## Bump the version to the next candidate + bumpversion candidate --no-tag + .PHONY: bumpversion-minor bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion --no-tag minor @@ -175,23 +179,31 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major -CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l) +CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) -.PHONY: check-release -check-release: ## Check if the release can be made +.PHONY: check-master +check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) $(error Please make the release from master branch\n) endif + +.PHONY: check-history +check-history: ## Check if HISTORY.md has been modified ifeq ($(CHANGELOG_LINES),0) $(error Please insert the release notes in HISTORY.md before releasing) -else - @echo "A new release can be made" endif +.PHONY: check-release +check-release: check-master check-history ## Check if the release can be made + @echo "A new release can be made" + .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-candidate +release-candidate: check-master publish bumpversion-candidate + .PHONY: release-minor release-minor: check-release bumpversion-minor release diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 8c30609e..3ede651e 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4-dev' +__version__ = '0.3.4.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index de7507c0..563c9c5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,18 +1,21 @@ [bumpversion] 
-current_version = 0.3.4-dev
+current_version = 0.3.4.dev0
 commit = True
 tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
 serialize = 
-	{major}.{minor}.{patch}-{release}
+	{major}.{minor}.{patch}-{release}{candidate}
 	{major}.{minor}.{patch}

 [bumpversion:part:release]
 optional_value = release
+first_value = dev
 values = 
 	dev
 	release

+[bumpversion:part:candidate]
+
 [bumpversion:file:setup.py]
 search = version='{current_version}'
 replace = version='{new_version}'
@@ -34,8 +37,12 @@
 include_trailing_comment = True
 line_length = 99
 lines_between_types = 0
 multi_line_output = 4
-not_skip = __init__.py
 use_parentheses = True
+not_skip = __init__.py
+skip_glob = *.bak
+
+[metadata]
+description-file = README.md

 [aliases]
 test = pytest
diff --git a/setup.py b/setup.py
index 421dbbd6..abc43800 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/HDI-Project/MLBlocks',
-    version='0.3.4-dev',
+    version='0.3.4.dev0',
     zip_safe=False,
 )
diff --git a/tox.ini b/tox.ini
index 76529366..666eeab0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,7 +14,7 @@
 setenv =
     PYTHONPATH = {toxinidir}
 extras = test
 commands =
-    /usr/bin/env python -m pytest --cov=mlblocks
+    /usr/bin/env make test


 [testenv:lint]

From 001561a169229ec652c08e4ace32dc3023d4bbd4 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:20:05 +0200
Subject: [PATCH 065/160] Add release-candidate documentation

---
 CONTRIBUTING.rst | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4fce53bf..4c01093e 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -195,3 +195,33 @@ Once this is done, run of the following commands:

 3. If you are releasing a major version::

     make release-major
+
+Release Candidates
+~~~~~~~~~~~~~~~~~~
+
+Sometimes it is necessary or convenient to upload a release candidate to PyPI as a pre-release,
+in order to make some of the new features available for testing on other projects before they
+are included in an actual full-blown release.
+
+In order to perform such an action, you can execute::
+
+    make release-candidate
+
+This will perform the following actions:
+
+1. Build and upload the current version to PyPI as a pre-release, with the format ``X.Y.Z.devN``
+
+2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
+
+After this is done, the new pre-release can be installed by including the ``dev`` section in the
+dependency specification, either in ``setup.py``::
+
+    install_requires = [
+        ...
+        'mlblocks>=X.Y.Z.dev',
+        ...
+    ]
+
+or in command line::
+
+    pip install 'mlblocks>=X.Y.Z.dev'

From 45f9ae2ae6b50a4a6ae1e50f326c130f5a571d69 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 27 Sep 2019 16:25:45 +0200
Subject: [PATCH 066/160] Fix error in Python 3.5 due to a nonexistent
 exception type

---
 tests/test_mlblock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mlblock.py b/tests/test_mlblock.py
index 355015d0..93adb0dd 100644
--- a/tests/test_mlblock.py
+++ b/tests/test_mlblock.py
@@ -39,7 +39,7 @@ def test_bad_object_name(self):
             import_object(__name__ + '.InvalidName')

     def test_bad_module(self):
-        with pytest.raises(ModuleNotFoundError):
+        with pytest.raises(ImportError):
             import_object('an.invalid.module')

From 09aa6e9466956d3883895b573d5ea03ad257b501 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:20:52 +0200
Subject: [PATCH 067/160] Fix release-candidate version format

---
 setup.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 563c9c5c..a122a298 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,9 +2,9 @@
 current_version = 0.3.4.dev0
 commit = True
 tag = True
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<candidate>\d+))?
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
 serialize = 
-	{major}.{minor}.{patch}-{release}{candidate}
+	{major}.{minor}.{patch}.{release}{candidate}
 	{major}.{minor}.{patch}

 [bumpversion:part:release]

From 2b5d7900a22ce72b7b59ef85a0f024ffec0a0c0d Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 3 Oct 2019 20:21:28 +0200
Subject: [PATCH 068/160] Bump version: 0.3.4.dev0 → 0.3.4.dev1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 3ede651e..81b45593 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.3.4.dev0'
+__version__ = '0.3.4.dev1'

 __all__ = [
     'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path',
diff --git a/setup.cfg b/setup.cfg
index a122a298..0c2ea21a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.4.dev0
+current_version = 0.3.4.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
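The dotted ``devN`` spelling that the previous patch switched to is the normalized PEP 440 form for development pre-releases, which is what pip and PyPI work with. A quick check, assuming the ``packaging`` library is available (an assumption; it is not declared anywhere in this series):

    from packaging.version import Version

    version = Version('0.3.4.dev1')
    assert version.is_prerelease and version.dev == 1

    # Dev releases sort below the final release they precede, which is
    # why a spec like "mlblocks>=X.Y.Z.dev" can match candidates:
    assert Version('0.3.4.dev1') < Version('0.3.4')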
diff --git a/setup.py b/setup.py index abc43800..da4bb6f3 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev0', + version='0.3.4.dev1', zip_safe=False, ) From d790938e8cee8528fe90725eb145fde3c6bd99e2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 30 Oct 2019 13:19:30 -0400 Subject: [PATCH 069/160] New partial output with context - WIP --- mlblocks/mlpipeline.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 9de286cb..d0d67f8f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -145,8 +145,11 @@ def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" block = self.blocks[block_name] outputs = deepcopy(block.produce_output) + output_names = self.output_names.get(block_name, dict()) for output in outputs: - output['variable'] = '{}.{}'.format(block_name, output['name']) + name = output['name'] + context_name = output_names.get(name, name) + output['variable'] = '{}.{}'.format(block_name, context_name) return outputs @@ -606,7 +609,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - if output_ is not None: + if isinstance(output_, str): output_variables = self.get_output_variables(output_) outputs = output_variables.copy() output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} @@ -615,6 +618,9 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): outputs = None output_blocks = set() + if isinstance(output_, int): + output_ = self._get_block_name(output_) + if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -628,16 +634,19 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): self._fit_block(block, block_name, context) - if (block_name != self._last_block_name) or (block_name in output_blocks): + last_block = block_name != self._last_block_name + if last_block or (block_name == output_) or (block_name in output_blocks): self._produce_block(block, block_name, context, output_variables, outputs) # We already captured the output from this block if block_name in output_blocks: output_blocks.remove(block_name) + elif block_name == output_: + return context # If there was an output_ but there are no pending # outputs we are done. 
- if output_ is not None and not output_blocks: + if output_variables is not None and not output_blocks: if len(outputs) > 1: return tuple(outputs) else: From 1a0eb099d177753d07797757bf0b3ae9a20de1f2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 31 Oct 2019 12:59:46 -0400 Subject: [PATCH 070/160] Allow getting full context in partial outputs --- mlblocks/mlpipeline.py | 61 +++++---- tests/features/test_partial_outputs.py | 7 +- tests/test_mlpipeline.py | 175 +++++++++++++++++++------ 3 files changed, 171 insertions(+), 72 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index d0d67f8f..21aa7ecc 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -4,6 +4,7 @@ import json import logging +import re from collections import Counter, OrderedDict, defaultdict from copy import deepcopy @@ -198,12 +199,15 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, if hyperparameters: self.set_hyperparameters(hyperparameters) + self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') + def _get_str_output(self, output): """Get the outputs that correspond to the str specification.""" if output in self.outputs: return self.outputs[output] elif output in self.blocks: - return self._get_block_outputs(output) + return [{'name': output, 'variable': output}] + # return self._get_block_outputs(output) elif '.' in output: block_name, variable_name = output.rsplit('.', 1) block = self.blocks.get(block_name) @@ -260,11 +264,11 @@ def get_outputs(self, outputs='default'): computed = list() for output in outputs: + if isinstance(output, int): + output = self._get_block_name(output) + if isinstance(output, str): computed.extend(self._get_str_output(output)) - elif isinstance(output, int): - block_name = self._get_block_name(output) - computed.extend(self._get_block_outputs(block_name)) else: raise TypeError('Output Specification can only be str or int') @@ -316,6 +320,18 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] + def _extract_block_name(self, variable_name): + return self._re_block_name.search(variable_name).group(1) + + def _prepare_outputs(self, outputs): + output_variables = self.get_output_variables(outputs) + outputs = output_variables.copy() + output_blocks = { + self._extract_block_name(variable) + for variable in output_variables + } + return output_variables, outputs, output_blocks + @staticmethod def _flatten_dict(hyperparameters): return { @@ -519,13 +535,11 @@ def _extract_outputs(self, block_name, outputs, block_outputs): return output_dict - def _update_outputs(self, block_name, output_variables, outputs, outputs_dict): + def _update_outputs(self, variable_name, output_variables, outputs, value): """Set the requested block outputs into the outputs list in the right place.""" - for key, value in outputs_dict.items(): - variable_name = '{}.{}'.format(block_name, key) - if variable_name in output_variables: - index = output_variables.index(variable_name) - outputs[index] = deepcopy(value) + if variable_name in output_variables: + index = output_variables.index(variable_name) + outputs[index] = deepcopy(value) def _fit_block(self, block, block_name, context): """Get the block args from the context and fit the block.""" @@ -554,7 +568,12 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): context.update(outputs_dict) if output_variables: - self._update_outputs(block_name, output_variables, outputs, outputs_dict) + 
if block_name in output_variables: + self._update_outputs(block_name, output_variables, outputs, context) + else: + for key, value in outputs_dict.items(): + variable_name = '{}.{}'.format(block_name, key) + self._update_outputs(variable_name, output_variables, outputs, value) except Exception: if self.verbose: @@ -609,17 +628,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if y is not None: context['y'] = y - if isinstance(output_, str): - output_variables = self.get_output_variables(output_) - outputs = output_variables.copy() - output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} - else: + if output_ is None: output_variables = None outputs = None output_blocks = set() - - if isinstance(output_, int): - output_ = self._get_block_name(output_) + else: + output_variables, outputs, output_blocks = self._prepare_outputs(output_) if isinstance(start_, int): start_ = self._get_block_name(start_) @@ -634,15 +648,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): self._fit_block(block, block_name, context) - last_block = block_name != self._last_block_name - if last_block or (block_name == output_) or (block_name in output_blocks): + if (block_name != self._last_block_name) or (block_name in output_blocks): self._produce_block(block, block_name, context, output_variables, outputs) # We already captured the output from this block if block_name in output_blocks: output_blocks.remove(block_name) - elif block_name == output_: - return context # If there was an output_ but there are no pending # outputs we are done. @@ -695,9 +706,7 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): if X is not None: context['X'] = X - output_variables = self.get_output_variables(output_) - outputs = output_variables.copy() - output_blocks = {variable.rsplit('.', 1)[0] for variable in output_variables} + output_variables, outputs, output_blocks = self._prepare_outputs(output_) if isinstance(start_, int): start_ = self._get_block_name(start_) diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py index d31d2dd8..50739cea 100644 --- a/tests/features/test_partial_outputs.py +++ b/tests/features/test_partial_outputs.py @@ -70,13 +70,14 @@ def test_fit_output(self): y = np.array([ 0, 0, 0, 0, 1 ]) + context = {'X': X, 'y': y} almost_equal(named_out, y) assert len(list_out) == 2 almost_equal(list_out[0], y) - almost_equal(list_out[1], X) - almost_equal(X, int_out) - almost_equal(X, str_out) + almost_equal(list_out[1], context) + almost_equal(context, int_out) + almost_equal(context, str_out) almost_equal(X, str_out_variable) assert no_output is None diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 7062e38e..f2edc36f 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -2,25 +2,36 @@ from collections import OrderedDict from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import MagicMock, call, patch +import pytest + +from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline +def get_mlblock_mock(*args, **kwargs): + return MagicMock(autospec=MLBlock) + + class TestMLPipline(TestCase): @patch('mlblocks.mlpipeline.LOGGER') @patch('mlblocks.mlpipeline.MLBlock') def test___init__(self, mlblock_mock, logger_mock): - blocks = [Mock(), Mock(), Mock()] - last_block = Mock() + blocks = [ + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock(), + get_mlblock_mock() + ] + 
last_block = blocks[-1] last_block.produce_output = [ { 'name': 'y', 'type': 'array' } ] - blocks.append(last_block) mlblock_mock.side_effect = blocks primitives = [ @@ -93,7 +104,7 @@ def test___init__(self, mlblock_mock, logger_mock): 'a.primitive.Name' ) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_tunable_hyperparameters(self): mlpipeline = MLPipeline(['a_primitive']) tunable = dict() @@ -104,7 +115,7 @@ def test_get_tunable_hyperparameters(self): assert returned == tunable assert returned is not tunable - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_tunable_hyperparameters_flat(self): mlpipeline = MLPipeline(['a_primitive']) mlpipeline._tunable_hyperparameters = { @@ -160,13 +171,13 @@ def test_get_tunable_hyperparameters_flat(self): } assert returned == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_hyperparameters(self): - block_1 = Mock() + block_1 = get_mlblock_mock() block_1.get_hyperparameters.return_value = { 'a': 'a' } - block_2 = Mock() + block_2 = get_mlblock_mock() block_2.get_hyperparameters.return_value = { 'b': 'b', 'c': 'c', @@ -192,13 +203,13 @@ def test_get_hyperparameters(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_hyperparameters_flat(self): - block_1 = Mock() + block_1 = get_mlblock_mock() block_1.get_hyperparameters.return_value = { 'a': 'a' } - block_2 = Mock() + block_2 = get_mlblock_mock() block_2.get_hyperparameters.return_value = { 'b': 'b', 'c': 'c', @@ -220,10 +231,10 @@ def test_get_hyperparameters_flat(self): block_1.get_hyperparameters.assert_called_once_with() block_2.get_hyperparameters.assert_called_once_with() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_set_hyperparameters(self): - block_1 = Mock() - block_2 = Mock() + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -241,10 +252,10 @@ def test_set_hyperparameters(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_set_hyperparameters_flat(self): - block_1 = Mock() - block_2 = Mock() + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() blocks = OrderedDict(( ('a.primitive.Name#1', block_1), ('a.primitive.Name#2', block_2), @@ -260,7 +271,7 @@ def test_set_hyperparameters_flat(self): block_1.set_hyperparameters.assert_not_called() block_2.set_hyperparameters.assert_called_once_with({'some': 'arg'}) - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_block_args(self): input_names = { 'a_block': { @@ -298,9 +309,10 @@ def test__get_block_args(self): } assert args == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_no_outputs(self): - self_ = Mock() + self_ = 
MagicMock(autospec=MLPipeline) + self_._last_block_name = 'last_block' self_._get_block_outputs.return_value = ['some', 'outputs'] @@ -315,9 +327,9 @@ def test__get_outputs_no_outputs(self): self_._get_block_outputs.assert_called_once_with('last_block') - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_defaults(self): - self_ = Mock() + self_ = MagicMock(autospec=MLPipeline) pipeline = dict() outputs = { @@ -331,9 +343,9 @@ def test__get_outputs_defaults(self): assert returned == expected self_._get_block_outputs.assert_not_called() - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test__get_outputs_additional(self): - self_ = Mock() + self_ = MagicMock(autospec=MLPipeline) pipeline = { 'outputs': { @@ -351,26 +363,90 @@ def test__get_outputs_additional(self): assert returned == expected self_._get_block_outputs.assert_not_called() - def test_get_outputs_str(self): - pass + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_named(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_variable', + 'type': 'a_type', + } + ], + 'debug': [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + } + pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + returned = pipeline.get_outputs('debug') + + expected = [ + { + 'name': 'another_name', + 'variable': 'another_variable', + } + ] + + assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_variable(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + returned = pipeline.get_outputs('a_primitive#1.output') + + expected = [ + { + 'name': 'output', + 'type': 'whatever', + 'variable': 'a_primitive#1.output' + } + ] + + assert returned == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_str_block(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + returned = pipeline.get_outputs('a_primitive#1') + + expected = [ + { + 'name': 'a_primitive#1', + 'variable': 'a_primitive#1', + } + ] + + assert returned == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_int(self): - pass + pipeline = MLPipeline(['a_primitive', 'another_primitive']) - def test_get_outputs_list_of_str(self): - pass + returned = pipeline.get_outputs(-1) - def test_get_outputs_list_of_int(self): - pass + expected = [ + { + 'name': 'another_primitive#1', + 'variable': 'another_primitive#1', + } + ] - def test_get_outputs_named_outputs(self): - pass + assert returned == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_combination(self): - pass - - @patch('mlblocks.mlpipeline.MLBlock') - def test_get_outputs_invalid(self, mlblock_mock): outputs = { 'default': [ { @@ -386,7 +462,6 @@ def test_get_outputs_invalid(self, mlblock_mock): } ] } - mlblock_mock.side_effect = [MagicMock(), MagicMock()] pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) pipeline.blocks['a_primitive#1'].produce_output = [ @@ -414,8 +489,8 @@ def test_get_outputs_invalid(self, mlblock_mock): 'variable': 'another_variable', }, { - 'name': 'something', - 'variable': 'another_primitive#1.something', 
+ 'name': 'another_primitive#1', + 'variable': 'another_primitive#1', }, { 'name': 'output', @@ -426,7 +501,21 @@ def test_get_outputs_invalid(self, mlblock_mock): assert returned == expected - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_outputs_invalid(self): + pipeline = MLPipeline(['a_primitive']) + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + with pytest.raises(ValueError): + pipeline.get_outputs('a_primitive#1.invalid') + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_output_names(self): outputs = { 'default': [ @@ -443,7 +532,7 @@ def test_get_output_names(self): assert names == ['a_name'] - @patch('mlblocks.mlpipeline.MLBlock', new=MagicMock()) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_output_variables(self): outputs = { 'default': [ From 6019adfeff7f167dcea2d7ec2ffc9a7864c16fee Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 31 Oct 2019 15:26:35 -0400 Subject: [PATCH 071/160] =?UTF-8?q?Bump=20version:=200.3.4.dev1=20?= =?UTF-8?q?=E2=86=92=200.3.4.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 81b45593..936c210f 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4.dev1' +__version__ = '0.3.4.dev2' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 0c2ea21a..58f63f5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4.dev1 +current_version = 0.3.4.dev2 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index da4bb6f3..60c97534 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev1', + version='0.3.4.dev2', zip_safe=False, ) From b7baf968b4be9d1b59384b13c9c55d5d5da3299e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:26 -0500 Subject: [PATCH 072/160] Release notes for v0.3.4 --- HISTORY.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index f3dc0a32..5b5d4f0b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,12 @@ Changelog ========= +0.3.4 - 2019-11-01 +------------------ + +* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala +* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala + 0.3.3 - 2019-09-09 ------------------ From b0cd3808f3291d9bd043362ff2e827ac626f8ef9 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:27 -0500 Subject: [PATCH 073/160] =?UTF-8?q?Bump=20version:=200.3.4.dev2=20?= =?UTF-8?q?=E2=86=92=200.3.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 936c210f..e4aa9838 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4.dev2' +__version__ = '0.3.4' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 58f63f5c..709511b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4.dev2 +current_version = 0.3.4 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 60c97534..7b243501 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4.dev2', + version='0.3.4', zip_safe=False, ) From 6ede62caed212b84067021fca2d3b29d187a8554 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 4 Nov 2019 10:05:40 -0500 Subject: [PATCH 074/160] =?UTF-8?q?Bump=20version:=200.3.4=20=E2=86=92=200?= =?UTF-8?q?.3.5.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index e4aa9838..618e7a55 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.4' +__version__ = '0.3.5.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 709511b4..61208b1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4 +current_version = 0.3.5.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 7b243501..09483fb3 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLBlocks', - version='0.3.4', + version='0.3.5.dev0', zip_safe=False, ) From 3ce7d89c3e81743c73400c0694ebb6e893acbc51 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 12 Dec 2019 15:38:54 +0100 Subject: [PATCH 075/160] Update paper references --- README.md | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 19f740ed..7c152fa3 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Pipelines and Primitives for Machine Learning and Data Science. * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks -- Homepage: https://github.com/HDI-Project/MLBlocks +* Homepage: https://github.com/HDI-Project/MLBlocks # MLBlocks @@ -237,10 +237,33 @@ If you want to learn more about how to tune the pipeline hyperparameters, save a the pipelines using JSON annotations or build complex multi-branched pipelines, please check our [documentation](https://HDI-Project.github.io/MLBlocks). -# History +## Citing MLBlocks + +If you use MLBlocks, please consider citing our related papers. + +For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at +the MIT Data To AI Lab, please see: + +Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar: +Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv +Preprint 1905.08942. 2019. + +``` bibtex +@article{smith2019mlbazaar, + author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan}, + title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development}, + journal = {arXiv e-prints}, + year = {2019}, + eid = {arXiv:1905.08942}, + pages = {arXiv:1905.08942}, + archivePrefix = {arXiv}, + eprint = {1905.08942}, +} +``` + +For the first MLBlocks version from 2015, designed for only multi table, multi entity temporal data, please +refer to Bryan Collazo’s thesis: -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: * [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf). Bryan Collazo. Masters thesis, MIT EECS, 2015. From 949c8b1d36abe3792e38bed3501645fde279a075 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 12 Dec 2019 15:53:36 +0100 Subject: [PATCH 076/160] Restrict dependency versions --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 09483fb3..1e8ef2ad 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ 'urllib3>=1.20,<1.25', 'setuptools>=41.0.0', 'numpy<1.17', + 'python-dateutil<2.8.1,>=2.1', ] From 6b8381a069e235d8083a02cac0e72550db3955e2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Dec 2019 19:18:09 +0100 Subject: [PATCH 077/160] Allow loading from json. 
 Deprecate old methods

---
 mlblocks/discovery.py   | 44 ++++++++++++++----------------------
 mlblocks/mlpipeline.py  | 11 +++++++++
 tests/test_discovery.py | 49 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/mlblocks/discovery.py b/mlblocks/discovery.py
index 9a1dbef5..24a469da 100644
--- a/mlblocks/discovery.py
+++ b/mlblocks/discovery.py
@@ -198,6 +198,12 @@ def get_pipelines_paths():
     return _PIPELINES_PATHS + _load_entry_points('pipelines')
 
 
+def _load_json(json_path):
+    with open(json_path, 'r') as json_file:
+        LOGGER.debug('Loading %s', json_path)
+        return json.load(json_file)
+
+
 def _load(name, paths):
     """Locate and load the JSON annotation in any of the given paths.
 
@@ -206,8 +212,7 @@ def _load(name, paths):
 
     Args:
         name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
         paths (list):
             list of paths where the primitives will be looked for.
 
@@ -215,6 +220,9 @@ def _load(name, paths):
         dict:
             The content of the JSON annotation file loaded into a dict.
     """
+    if os.path.isfile(name):
+        return _load_json(name)
+
     for base_path in paths:
         parts = name.split('.')
         number_of_parts = len(parts)
@@ -225,12 +233,7 @@ def _load(name, paths):
             json_path = os.path.join(folder, filename)
 
             if os.path.isfile(json_path):
-                with open(json_path, 'r') as json_file:
-                    LOGGER.debug('Loading %s from %s', name, json_path)
-                    return json.load(json_file)
-
-
-_PRIMITIVES = dict()
+                return _load_json(json_path)
 
 
 def load_primitive(name):
@@ -241,8 +244,7 @@ def load_primitive(name):
 
     Args:
        name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
 
     Returns:
         dict:
@@ -252,20 +254,13 @@ def load_primitive(name):
         ValueError:
             A ``ValueError`` will be raised if the primitive cannot be found.
     """
-    primitive = _PRIMITIVES.get(name)
+    primitive = _load(name, get_primitives_paths())
     if primitive is None:
-        primitive = _load(name, get_primitives_paths())
-        if primitive is None:
-            raise ValueError("Unknown primitive: {}".format(name))
-
-        _PRIMITIVES[name] = primitive
+        raise ValueError("Unknown primitive: {}".format(name))
 
     return primitive
 
 
-_PIPELINES = dict()
-
-
 def load_pipeline(name):
     """Locate and load the pipeline JSON annotation.
 
@@ -274,8 +269,7 @@ def load_pipeline(name):
 
     Args:
         name (str):
-            name of the JSON to look for. The name should not contain the
-            ``.json`` extension, as it will be added dynamically.
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
 
     Returns:
         dict:
@@ -285,13 +279,9 @@ def load_pipeline(name):
         ValueError:
             A ``ValueError`` will be raised if the pipeline cannot be found.
     """
-    pipeline = _PIPELINES.get(name)
+    pipeline = _load(name, get_pipelines_paths())
     if pipeline is None:
-        pipeline = _load(name, get_pipelines_paths())
-        if pipeline is None:
-            raise ValueError("Unknown pipeline: {}".format(name))
-
-        _PIPELINES[name] = pipeline
+        raise ValueError("Unknown pipeline: {}".format(name))
 
     return pipeline

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..962d7c19 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import re
+import warnings
 from collections import Counter, OrderedDict, defaultdict
 from copy import deepcopy
 
@@ -814,6 +815,11 @@ def from_dict(cls, metadata):
             A new MLPipeline instance with the details found in the
             given specification dictionary.
         """
+        warnings.warn(
+            'MLPipeline.from_dict(pipeline_dict) is deprecated and will be removed in a '
+            'later release. Please use MLPipeline(dict) instead.',
+            DeprecationWarning
+        )
         return cls(metadata)
 
     @classmethod
@@ -831,6 +837,11 @@ def load(cls, path):
             A new MLPipeline instance with the specification found
             in the JSON file.
         """
+        warnings.warn(
+            'MLPipeline.load(path) is deprecated and will be removed in a later release. '
+            'Please use MLPipeline(path) instead.',
+            DeprecationWarning
+        )
         with open(path, 'r') as in_file:
             metadata = json.load(in_file)
 
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index dc3eca87..a11fc02c 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,55 @@ def test__load_success():
     assert primitive == loaded
 
 
+def test__load_json_path():
+    primitive = {
+        'name': 'temp.primitive',
+        'primitive': 'temp.primitive'
+    }
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        paths = [tempdir]
+        primitive_path = os.path.join(tempdir, 'temp.primitive.json')
+        with open(primitive_path, 'w') as primitive_file:
+            json.dump(primitive, primitive_file, indent=4)
+
+        loaded = discovery._load(primitive_path, paths)
+
+    assert primitive == loaded
+
+
+def _load(name, paths):
+    """Locate and load the JSON annotation in any of the given paths.
+
+    All the given paths will be scanned to find a JSON file with the given name,
+    and as soon as a JSON with the given name is found it is returned.
+
+    Args:
+        name (str):
+            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
+        paths (list):
+            list of paths where the primitives will be looked for.
+
+    Returns:
+        dict:
+            The content of the JSON annotation file loaded into a dict.
+    """
+    if os.path.isfile(name):
+        return _load_json(name)
+
+    for base_path in paths:
+        parts = name.split('.')
+        number_of_parts = len(parts)
+
+        for folder_parts in range(number_of_parts):
+            folder = os.path.join(base_path, *parts[:folder_parts])
+            filename = '.'.join(parts[folder_parts:]) + '.json'
+            json_path = os.path.join(folder, filename)
+
+            if os.path.isfile(json_path):
+                return _load_json(json_path)
+
+
 @patch('mlblocks.discovery.get_primitives_paths')
 @patch('mlblocks.discovery._load')

From be684dd593f89cd21bd74efb53d6aa97b8c02970 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 19 Dec 2019 19:26:05 +0100
Subject: [PATCH 078/160] Remove unneeded code

---
 tests/test_discovery.py | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index a11fc02c..25e6e444 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -162,6 +162,7 @@ def test__load_success():
     assert primitive == loaded
 
 
+
 def test__load_json_path():
     primitive = {
         'name': 'temp.primitive',
@@ -179,39 +180,6 @@ def test__load_json_path():
     assert primitive == loaded
 
 
-def _load(name, paths):
-    """Locate and load the JSON annotation in any of the given paths.
-
-    All the given paths will be scanned to find a JSON file with the given name,
-    and as soon as a JSON with the given name is found it is returned.
-
-    Args:
-        name (str):
-            Path to a JSON file or name of the JSON to look for without the ``.json`` extension.
-        paths (list):
-            list of paths where the primitives will be looked for.
-
-    Returns:
-        dict:
-            The content of the JSON annotation file loaded into a dict.
-    """
-    if os.path.isfile(name):
-        return _load_json(name)
-
-    for base_path in paths:
-        parts = name.split('.')
-        number_of_parts = len(parts)
-
-        for folder_parts in range(number_of_parts):
-            folder = os.path.join(base_path, *parts[:folder_parts])
-            filename = '.'.join(parts[folder_parts:]) + '.json'
-            json_path = os.path.join(folder, filename)
-
-            if os.path.isfile(json_path):
-                return _load_json(json_path)
-
-
 @patch('mlblocks.discovery.get_primitives_paths')
 @patch('mlblocks.discovery._load')
 def test__load_primitive_value_error(load_mock, gpp_mock):

From 1920227548edbb11b851b1864044cabc577b8e03 Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Thu, 9 Jan 2020 11:41:23 -0500
Subject: [PATCH 079/160] Add get_inputs function

---
 mlblocks/mlpipeline.py   |  78 ++++++++++++++++++++++++++++
 tests/test_mlpipeline.py | 107 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 21aa7ecc..dce30cfe 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -154,6 +154,45 @@ def _get_block_outputs(self, block_name):
 
         return outputs
 
+    def _get_block_outputs_dict(self, block_name):
+        """Get dictionary of output variables for the given block."""
+        block = self.blocks[block_name]
+        outputs = deepcopy(block.produce_output)
+        output_names = self.output_names.get(block_name, dict())
+        output_dict = {}
+        for output in outputs:
+            name = output['name']
+            context_name = output_names.get(name, name)
+            output_dict[context_name] = output
+
+        return output_dict
+
+    def _get_block_inputs_dict(self, block_name):
+        """Get dictionary of input variables for the given block."""
+        block = self.blocks[block_name]
+        print(block.produce_args)
+        inputs = deepcopy(block.produce_args)
+        input_names = self.input_names.get(block_name, dict())
+        inputs_dict = {}
+        for input_value in inputs:
+ name = input_value['name'] + context_name = input_names.get(name, name) + inputs_dict[context_name] = input_value + return inputs_dict + + def _get_block_fit_inputs_dict(self, block_name): + """Get the list of fit input variables for the given block.""" + block = self.blocks[block_name] + fit_inputs = deepcopy(block.fit_args) + input_names = self.input_names.get(block_name, dict()) + fit_inputs_dict = {} + for fit_input in fit_inputs: + name = fit_input['name'] + context_name = input_names.get(name, name) + fit_inputs_dict[context_name] = fit_input + + return fit_inputs_dict + def _get_outputs(self, pipeline, outputs): """Get the output definitions from the pipeline dictionary. @@ -224,6 +263,45 @@ def _get_str_output(self, output): raise ValueError('Invalid Output Specification: {}'.format(output)) + def get_inputs(self, fit=True): + """Get a dictionary mapping all input variable names required by the + pipeline to a dictionary with their specified information. + + Can be specified to include fit arguments. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to ``True``. + + Returns: + dictionary: + A dictionary mapping every input variable's name to a dictionary + specifying the information corresponding to that input variable. + Each dictionary contains the entry ``name``, as + well as any other metadata that may have been included in the + pipeline inputs specification. + + Raises: + ValueError: + If an input specification is not valid. + TypeError: + If the type of a specification is not an str or an int. + """ + inputs = dict() + for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards + produce_outputs = self._get_block_outputs_dict(block_name) + for produce_output_name in produce_outputs.keys(): + inputs.pop(produce_output_name, None) + + produce_inputs = self._get_block_inputs_dict(block_name) + inputs.update(produce_inputs) + + if fit: + fit_inputs = self._get_block_fit_inputs_dict(block_name) + inputs.update(fit_inputs) + + return inputs + def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. 
diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index f2edc36f..88cb8c44 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -549,6 +549,113 @@ def test_get_output_variables(self): assert names == ['a_variable'] + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_fit(self): + expected = { + 'input': { + 'name': 'input', + 'type': 'whatever', + }, + 'fit_input': { + 'name': 'fit_input', + 'type': 'whatever', + }, + 'another_input': { + 'name': 'another_input', + 'type': 'another_whatever', + } + + } + + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs() + assert inputs == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_no_fit(self): + expected = { + 'input': { + 'name': 'input', + 'type': 'whatever', + }, + 'another_input': { + 'name': 'another_input', + 'type': 'another_whatever', + } + + } + + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs(fit=False) + + assert inputs == expected + def test_fit(self): pass From 0d2108f00b5daa62aa37b6ce715ac7ea01bc0b3f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 9 Jan 2020 11:46:21 -0500 Subject: [PATCH 080/160] Remove incorrect docstring --- mlblocks/mlpipeline.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index dce30cfe..7f23bf28 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -280,12 +280,6 @@ def get_inputs(self, fit=True): Each dictionary contains the entry ``name``, as well as any other metadata that may have been included in the pipeline inputs specification. - - Raises: - ValueError: - If an input specification is not valid. - TypeError: - If the type of a specification is not an str or an int. 
""" inputs = dict() for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards From 4f456bdc3c5cb7200d0ca5e36a0ba05ec1e68e9f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Mon, 13 Jan 2020 17:09:40 -0500 Subject: [PATCH 081/160] Address comments --- mlblocks/mlpipeline.py | 85 +++++++++++++++++----------------------- tests/test_mlpipeline.py | 23 +++++++++++ 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 7f23bf28..fbd5bcf0 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -144,54 +144,35 @@ def _get_pipeline_dict(pipeline, primitives): def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" - block = self.blocks[block_name] - outputs = deepcopy(block.produce_output) - output_names = self.output_names.get(block_name, dict()) - for output in outputs: - name = output['name'] - context_name = output_names.get(name, name) + outputs = self._get_block_variables(block_name, + 'produce_output', + self.output_names.get(block_name, dict())) + for context_name, output in outputs.items(): output['variable'] = '{}.{}'.format(block_name, context_name) - return outputs - - def _get_block_outputs_dict(self, block_name): - """Get dictionary of output variables for the given block.""" - block = self.blocks[block_name] - outputs = deepcopy(block.produce_output) - output_names = self.output_names.get(block_name, dict()) - output_dict = {} - for output in outputs: - name = output['name'] - context_name = output_names.get(name, name) - output_dict[context_name] = output + return list(outputs.values()) - return output_dict + def _get_block_variables(self, block_name, variables_attr, names): + """Get dictionary of variable names to the variable for a given block - def _get_block_inputs_dict(self, block_name): - """Get dictionary of input variables for the given block.""" - block = self.blocks[block_name] - print(block.produce_args) - inputs = deepcopy(block.produce_args) - input_names = self.input_names.get(block_name, dict()) - inputs_dict = {} - for input_value in inputs: - name = input_value['name'] - context_name = input_names.get(name, name) - inputs_dict[context_name] = input_value - return inputs_dict - - def _get_block_fit_inputs_dict(self, block_name): - """Get the list of fit input variables for the given block.""" + Args: + block_name (str): + Name of the block for which to get the specification + variables_attr (str): + Name of the attribute that has the variables list. It can be + `fit_args`, `produce_args` or `produce_output`. + names (dict): + Dictionary used to translate the variable names. + """ block = self.blocks[block_name] - fit_inputs = deepcopy(block.fit_args) - input_names = self.input_names.get(block_name, dict()) - fit_inputs_dict = {} - for fit_input in fit_inputs: - name = fit_input['name'] - context_name = input_names.get(name, name) - fit_inputs_dict[context_name] = fit_input + variables = deepcopy(getattr(block, variables_attr)) + variable_dict = {} + for variable in variables: + name = variable['name'] + context_name = names.get(name, name) + variable_dict[context_name] = variable - return fit_inputs_dict + return variable_dict def _get_outputs(self, pipeline, outputs): """Get the output definitions from the pipeline dictionary. 
@@ -264,10 +245,11 @@ def _get_str_output(self, output): raise ValueError('Invalid Output Specification: {}'.format(output)) def get_inputs(self, fit=True): - """Get a dictionary mapping all input variable names required by the - pipeline to a dictionary with their specified information. + """Get a relation of all the input variables required by this pipeline. - Can be specified to include fit arguments. + The result is a dictionary that maps each variable name with their + specified information. + Optionally include the fit arguments. Args: fit (bool): @@ -283,15 +265,22 @@ def get_inputs(self, fit=True): """ inputs = dict() for block_name in reversed(self.blocks.keys()): # iterates through pipeline backwards - produce_outputs = self._get_block_outputs_dict(block_name) + produce_outputs = self._get_block_variables(block_name, + 'produce_output', + self.output_names.get(block_name, dict())) + for produce_output_name in produce_outputs.keys(): inputs.pop(produce_output_name, None) - produce_inputs = self._get_block_inputs_dict(block_name) + produce_inputs = self._get_block_variables(block_name, + 'produce_args', + self.input_names.get(block_name, dict())) inputs.update(produce_inputs) if fit: - fit_inputs = self._get_block_fit_inputs_dict(block_name) + fit_inputs = self._get_block_variables(block_name, + 'fit_args', + self.input_names.get(block_name, dict())) inputs.update(fit_inputs) return inputs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 88cb8c44..4fb779b8 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -549,6 +549,29 @@ def test_get_output_variables(self): assert names == ['a_variable'] + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test__get_block_variables(self): + expected = { + 'name_output': { + 'name': 'output', + 'type': 'whatever', + } + } + + pipeline = MLPipeline(['a_primitive']) + + pipeline.blocks['a_primitive#1'].produce_outputs = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + outputs = pipeline._get_block_variables('a_primitive#1', + 'produce_outputs', + {'output': 'name_output'}) + assert outputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_inputs_fit(self): expected = { From 1dd0f372111a775a1d27b2c77641f7fa884a552f Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 14 Jan 2020 10:19:50 -0500 Subject: [PATCH 082/160] Change indenting --- AUTHORS.rst | 1 + mlblocks/mlpipeline.py | 32 ++++++++++++++++++++------------ tests/test_mlpipeline.py | 8 +++++--- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index eb8885c9..7245c735 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,3 +10,4 @@ Contributors * William Xue * Akshay Ravikumar * Laura Gustafson +* Erica Chiu diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index fbd5bcf0..35273642 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -144,9 +144,11 @@ def _get_pipeline_dict(pipeline, primitives): def _get_block_outputs(self, block_name): """Get the list of output variables for the given block.""" - outputs = self._get_block_variables(block_name, - 'produce_output', - self.output_names.get(block_name, dict())) + outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) for context_name, output in outputs.items(): output['variable'] = '{}.{}'.format(block_name, context_name) @@ -265,22 +267,28 @@ def get_inputs(self, fit=True): """ inputs = dict() for block_name in 
reversed(self.blocks.keys()): # iterates through pipeline backwards - produce_outputs = self._get_block_variables(block_name, - 'produce_output', - self.output_names.get(block_name, dict())) + produce_outputs = self._get_block_variables( + block_name, + 'produce_output', + self.output_names.get(block_name, dict()) + ) for produce_output_name in produce_outputs.keys(): inputs.pop(produce_output_name, None) - produce_inputs = self._get_block_variables(block_name, - 'produce_args', - self.input_names.get(block_name, dict())) + produce_inputs = self._get_block_variables( + block_name, + 'produce_args', + self.input_names.get(block_name, dict()) + ) inputs.update(produce_inputs) if fit: - fit_inputs = self._get_block_variables(block_name, - 'fit_args', - self.input_names.get(block_name, dict())) + fit_inputs = self._get_block_variables( + block_name, + 'fit_args', + self.input_names.get(block_name, dict()) + ) inputs.update(fit_inputs) return inputs diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 4fb779b8..340a3838 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -567,9 +567,11 @@ def test__get_block_variables(self): } ] - outputs = pipeline._get_block_variables('a_primitive#1', - 'produce_outputs', - {'output': 'name_output'}) + outputs = pipeline._get_block_variables( + 'a_primitive#1', + 'produce_outputs', + {'output': 'name_output'} + ) assert outputs == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) From 93994e2a0c177fb8bab33f7fe57dd1eaae61a708 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 23 Jan 2020 20:43:15 +0100 Subject: [PATCH 083/160] Add notebook tutorials and examples --- Makefile | 4 + ...ification.categorical_encoder.xgboost.json | 16 + .../mlblocks.examples.ClassPrimitive.json | 104 ++ .../mlblocks.examples.function_primitive.json | 86 ++ .../tutorials/1. Using and MLPipeline.ipynb | 633 +++++++++++++ .../2. Finding and Loading a Pipeline.ipynb | 123 +++ .... Setting MLPipeline Hyperparameters.ipynb | 430 +++++++++ .../4. Saving and Loading a Pipeline.ipynb | 181 ++++ examples/tutorials/5. Tuning a Pipeline.ipynb | 463 +++++++++ ...or the best pipeline with BTBSession.ipynb | 895 ++++++++++++++++++ setup.py | 7 + 11 files changed, 2942 insertions(+) create mode 100644 examples/pipelines/single_table.classification.categorical_encoder.xgboost.json create mode 100644 examples/primitives/mlblocks.examples.ClassPrimitive.json create mode 100644 examples/primitives/mlblocks.examples.function_primitive.json create mode 100644 examples/tutorials/1. Using and MLPipeline.ipynb create mode 100644 examples/tutorials/2. Finding and Loading a Pipeline.ipynb create mode 100644 examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb create mode 100644 examples/tutorials/4. Saving and Loading a Pipeline.ipynb create mode 100644 examples/tutorials/5. Tuning a Pipeline.ipynb create mode 100644 examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb diff --git a/Makefile b/Makefile index 6e8dd203..bfc1a5f6 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,10 @@ clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all install: clean-build clean-pyc ## install the package to the active Python's site-packages pip install . 
+.PHONY: install-examples
+install-examples: clean-build clean-pyc ## install the package and the examples dependencies
+	pip install .[examples]
+
 .PHONY: install-test
 install-test: clean-build clean-pyc ## install the package and test dependencies
 	pip install .[test]
diff --git a/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
new file mode 100644
index 00000000..4dca4002
--- /dev/null
+++ b/examples/pipelines/single_table.classification.categorical_encoder.xgboost.json
@@ -0,0 +1,16 @@
+{
+    "metadata": {
+        "data_modality": "single_table",
+        "task_type": "classification"
+    },
+    "validation": {
+        "dataset": "census"
+    },
+    "primitives": [
+        "mlprimitives.custom.preprocessing.ClassEncoder",
+        "mlprimitives.custom.feature_extraction.CategoricalEncoder",
+        "sklearn.impute.SimpleImputer",
+        "xgboost.XGBClassifier",
+        "mlprimitives.custom.preprocessing.ClassDecoder"
+    ]
+}
diff --git a/examples/primitives/mlblocks.examples.ClassPrimitive.json b/examples/primitives/mlblocks.examples.ClassPrimitive.json
new file mode 100644
index 00000000..6c29e51e
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.ClassPrimitive.json
@@ -0,0 +1,104 @@
+{
+    "name": "the_primitive_name",
+    "primitive": "full.python.path.to.AClass",
+    "fit": {
+        "method": "fit",
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_fit_method_argument",
+                "description": "each input can be described",
+                "type": "pandas.DataFrame"
+            },
+            {
+                "name": "y",
+                "description": "each input can be described",
+                "default": "default_value_for_this_argument",
+                "type": "pandas.Series"
+            }
+        ]
+    },
+    "produce": {
+        "method": "predict",
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_produce_method_argument",
+                "description": "each input can be described",
+                "type": "DataFrame"
+            }
+        ],
+        "output": [
+            {
+                "name": "y",
+                "description": "each output argument can be described",
+                "type": "Series"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "a_required_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that needs to be specified by the user because it does not have a default value",
+                "type": "int"
+            },
+            "an_optional_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that is optional because it has a default value",
+                "type": "int",
+                "default": 1
+            }
+        },
+        "tunable": {
+            "a_simple_range_hyperparameter": {
+                "description": "hyperparameter documentation can be put here",
+                "default": 1,
+                "type": "int",
+                "range": [1, 10]
+            },
+            "a_categorical_hyperparameter_of_type_int": {
+                "description": "Note that it has the field `values` instead of `range`",
+                "default": 1,
+                "type": "int",
+                "values": [1, 3, 7, 10]
+            },
+            "a_categorical_hyperparameter_of_type_str": {
+                "default": "a",
+                "type": "str",
+                "values": ["a", "b", "c"]
+            },
+            "a_multi_type_hyperparameter": {
+                "description": "this is a hyperparameter that allows more than one type",
+                "type": "multitype",
+                "default": "auto",
+                "types": {
+                    "int": {
+                        "description": "documentation can also be included here",
+                        "range": [1, 10]
+                    },
+                    "string": {
+                        "values": ["some", "string", "values"]
+                    }
+                }
+            },
+            "conditional_hyperparameter": {
+                "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter",
+                "type": "conditional",
+                "condition": "the_name_of_the_other_hyperparameter",
+                "values": {
+                    "a": {
+                        "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+                        "type": "int",
+                        "default": 0,
+                        "range": [0, 10]
+                    },
+                    "*": {
+                        "description": "this will be used only if the value does not match any other definition",
+                        "type": "float",
+                        "default": 0.0,
+                        "range": [0.0, 1.0]
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/examples/primitives/mlblocks.examples.function_primitive.json b/examples/primitives/mlblocks.examples.function_primitive.json
new file mode 100644
index 00000000..f3627bd9
--- /dev/null
+++ b/examples/primitives/mlblocks.examples.function_primitive.json
@@ -0,0 +1,86 @@
+{
+    "name": "the_primitive_name",
+    "primitive": "full.python.path.to.a_function",
+    "produce": {
+        "args": [
+            {
+                "name": "X",
+                "keyword": "optional_name_of_the_produce_method_argument",
+                "description": "each input can be described",
+                "type": "DataFrame"
+            }
+        ],
+        "output": [
+            {
+                "description": "each output argument can be described",
+                "name": "y",
+                "type": "Series"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "a_required_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that needs to be specified by the user, because it does not have a default value",
+                "type": "int"
+            },
+            "an_optional_hyperparameter": {
+                "description": "this is a non tunable hyperparameter that is optional, because it has a default value",
+                "type": "int",
+                "default": 1
+            }
+        },
+        "tunable": {
+            "a_simple_range_hyperparameter": {
+                "description": "hyperparameter documentation can be put here",
+                "default": 1,
+                "type": "int",
+                "range": [1, 10]
+            },
+            "a_categorical_hyperparameter_of_type_int": {
+                "description": "Note that it has the field `values` instead of `range`",
+                "default": 1,
+                "type": "int",
+                "values": [1, 3, 7, 10]
+            },
+            "a_categorical_hyperparameter_of_type_str": {
+                "default": "a",
+                "type": "str",
+                "values": ["a", "b", "c"]
+            },
+            "a_multi_type_hyperparameter": {
+                "description": "this is a hyperparameter that allows more than one type",
+                "type": "multitype",
+                "default": "auto",
+                "types": {
+                    "int": {
+                        "description": "documentation can also be included here",
+                        "range": [1, 10]
+                    },
+                    "string": {
+                        "values": ["some", "string", "values"]
+                    }
+                }
+            },
+            "conditional_hyperparameter": {
+                "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter",
+                "type": "conditional",
+                "condition": "the_name_of_the_other_hyperparameter",
+                "values": {
+                    "a": {
+                        "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`",
+                        "type": "int",
+                        "default": 0,
+                        "range": [0, 10]
+                    },
+                    "*": {
+                        "description": "this will be used only if the value does not match any other definition",
+                        "type": "float",
+                        "default": 0.0,
+                        "range": [0.0, 1.0]
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb
new file mode 100644
index 00000000..733fb42d
--- /dev/null
+++ b/examples/tutorials/1. Using and MLPipeline.ipynb
@@ -0,0 +1,633 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using an MLPipeline\n",
+    "\n",
+    "In this short guide we will go over the basic MLPipeline functionality.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a demo dataset.\n",
+    "2. Build a pipeline.\n",
+    "3. Explore the pipeline primitives, inputs and outputs.\n",
+    "4. Fit the pipeline to the dataset.\n",
+    "5. Make predictions using the fitted pipeline.\n",
+    "6. Evaluate the pipeline performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the Dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset using the function provided by mlprimitives"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This version of the Census dataset is prepared as a Classification (Supervised) Problem,\n",
+    "and has an input matrix `X` and an expected outcome `y` array."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Adult Census dataset.\n",
+      "\n",
+      "    Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n",
+      "\n",
+      "    Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n",
+      "    records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n",
+      "    (AFNLWGT>1)&& (HRSWK>0))\n",
+      "\n",
+      "    Prediction task is to determine whether a person makes over 50K a year.\n",
+      "\n",
+      "    source: \"UCI\n",
+      "    sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income/"/n",
+      "    \n",
+      "Data Modality: single_table\n",
+      "Task Type: classification\n",
+      "Task Subtype: binary\n",
+      "Data shape: (32561, 14)\n",
+      "Target shape: (32561,)\n",
+      "Metric: accuracy_score\n",
+      "Extras: \n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The data from the dataset can be explored by looking at its `.data` and `.target` attributes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39 State-gov 77516 Bachelors 13 \n", + "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", + "2 38 Private 215646 HS-grad 9 \n", + "3 53 Private 234721 11th 7 \n", + "4 28 Private 338409 Bachelors 13 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174 0 40 United-States \n", + "1 0 0 13 United-States \n", + "2 0 0 40 United-States \n", + "3 0 0 40 United-States \n", + "4 0 0 40 Cuba " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.target[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset data can also be splitted in multipe parts for cross validation using the `dataset.get_splits` method.\n", + "\n", + "For this demo we will be making only one split, which is equivalent to a simple train/test holdout partitioning." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(24420, 14)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8141, 14)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a pipeline\n", + "\n", + "Once we have the dataset we will build a pipeline that works with it.\n", + "\n", + "In this case, we will be creating a short pipeline that uses the following primitives:\n", + "\n", + "- `ClassEncoder` from `mlprimitives`, which encodes the target variable `y` as integers.\n", + "- `CategoricaEncoder` from `mlprimitives`, which encodes all the categorical variables from the feature matrix `X`\n", + " using one-hot encoding.\n", + "- `SimpleImputer` from `sklearn`, which imputes any null values that may exist in the feature matrix `X`\n", + "- `XGBClassifier` from `xgboost`, which learns to predict the target variable `y` sing the feature matrix `X`.\n", + "- `ClassDecoder` from `mlprimitives`, which reverts the `ClassEncoder` transformation to return the original\n", + " target labels instead of integers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "primitives = [\n", + " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", + "]\n", + "pipeline = MLPipeline(primitives)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the primitives included in this pipeline by having a look at its `primitives` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlprimitives.custom.preprocessing.ClassEncoder',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'xgboost.XGBClassifier',\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inputs\n", + "\n", + "We can also see the inputs of the pipeline using the `get_inputs` method.\n", + "\n", + "This will traverse the pipeline execution graph and show all the variables that need to be\n", + "provided by the user in order to fit this pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'type': 'ndarray'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can pass the `fit=False` argument, which will give us the variables needed\n", + "in order to make predictions." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'X': {'name': 'X', 'type': 'DataFrame'},\n", + " 'y': {'name': 'y', 'default': None, 'type': 'ndarray'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_inputs(fit=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note how the `fit` method expects two variables `X` and `y`, while the `predict`\n", + "method only needs `X`, as the `y` variable has a default value of `None`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outputs\n", + "\n", + "Equally, we can see the outputs that the pipeline will return when used to make predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'y',\n", + " 'type': 'ndarray',\n", + " 'variable': 'mlprimitives.custom.preprocessing.ClassDecoder#1.y'}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_outputs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the Pipeline to the Dataset\n", + "\n", + "Now that the pipeline is ready and we know its inputs and outputs, we can fit it to the\n", + "dataset by passing the training `X` and `y` variables to its `fit` method." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "After the pipeline has finished fitting, we can try to predict the `y_test` array values by\n", + "passing the `X_test` matrix to the `pipeline.predict` method." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "predictions = pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the pipeline performance\n", + "\n", + "Now we can compare the predicted array with the actual test array to see how well\n", + "our pipeline performed.\n", + "\n", + "This can be done using the `dataset.score` method, which provides a suitable scoring\n", + "function for this kind of data and problem.\n", + "In this case, the dataset is just computing the accuracy score." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb new file mode 100644 index 00000000..a94c48bc --- /dev/null +++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finding and Loading a Pipeline\n", + "\n", + "In this short tutorial we will show you how to search for pipelines suitable to solve\n", + "your prediction problem."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to find a suitable pipeline, the first thing we need is to identify\n", + "the type of problem (data modality + task type) that we are facing.\n", + "\n", + "This is a full list of current data modalities and task types that we cover:\n", + "\n", + "| Problem Type | Data Modality | Task Type |\n", + "|:-------------------------------------|:--------------|:------------------------|\n", + "| Single Table Classification | single_table | classification |\n", + "| Single Table Regression | single_table | regression |\n", + "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n", + "| Multi Table Classification | multi_table | classification |\n", + "| Multi Table Regression | multi_table | regression |\n", + "| Time Series Classification | timeseries | classification |\n", + "| Time Series Regression | timeseries | regression |\n", + "| Time Series Forecasting | timeseries | forecasting |\n", + "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n", + "| Image Classification | image | classification |\n", + "| Image Regression | image | regression |\n", + "| Graph Link Prediction | graph | link_prediction |\n", + "| Graph Vertex Nomination | graph | vertex_nomination |\n", + "| Graph Community Detection | graph | community_detection |\n", + "| Graph Matching | graph | graph_matching |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have identified our data modality and task type we can use the\n", + "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n", + "that support this particular problem type.\n", + "\n", + "For example, if we are looking for a pipeline to work on Image Classification\n", + "we will do the following query." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['image.classification.hog.random_forest',\n", + " 'image.classification.hog.xgboost',\n", + " 'image.classification.resnet50.xgboost']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "filters = {\n", + " 'metadata.data_modality': 'image',\n", + " 'metadata.task_type': 'classification',\n", + "}\n", + "\n", + "find_pipelines(filters=filters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n", + "by passing its name to the `MLPipeline` constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('image.classification.resnet50.xgboost')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb new file mode 100644 index 00000000..29f60a8f --- /dev/null +++ b/examples/tutorials/3. 
Setting MLPipeline Hyperparameters.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting MLPipeline Hyperparameters\n", + "\n", + "In this short guide we will see how to modify the hyperparameters\n", + "of an MLPipeline in order to change its behavior or performance.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load a dataset and a Pipeline.\n", + "2. Explore the pipeline hyperparameters.\n", + "3. Reload the pipeline with different hyperparameters.\n", + "4. Evaluate the pipeline performance on the dataset.\n", + "5. Set different pipeline hyperparameters.\n", + "6. Re-evaluate the pipeline performance on the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset and the Pipeline\n", + "\n", + "The first step will be to load the dataset and the pipeline that we will be using." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')\n", + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Pipeline Hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n", + "calling its `get_hyperparameters` method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'mean'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'n_estimators': 100,\n", + " 'max_depth': 3,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us a dictionary that contains one entry for each step in the pipeline.\n", + "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n", + "\n", + "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n", + "\n", + "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n", + "that work with flat, one-level dictionaries, the argument `flat=True` can be passed."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'keep'): False,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'features'): 'auto',\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_unique_ratio'): 0,\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n", + " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n", + " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n", + " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will return us the same information as before, but organized as a single one-level\n", + "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Pipeline hyperparameter values\n", + "\n", + "We can set some different hyperparameter values when loading the pipeline by adding the\n", + "`init_params` argument to `MLPipeline`.\n", + "\n", + "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n", + "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n", + "want to use on that step.\n", + "\n", + "As an example, we will set a different imputer strategy and a different xgboost max depth." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "init_params = {\n", + " 'sklearn.impute.SimpleImputer#1': {\n", + " 'strategy': 'median'\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'max_depth': 4\n", + " }\n", + "}\n", + "pipeline = MLPipeline(\n", + " 'single_table.classification.categorical_encoder.xgboost',\n", + " init_params=init_params\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now see how the hyperparameters are different from before."
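+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick illustration, the flat dictionary can be regrouped into the nested format\n", + "whenever needed. The following `unflatten` helper is just a sketch, not part of the\n", + "MLBlocks API:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "def unflatten(flat_hyperparameters):\n", + "    \"\"\"Regroup a flat {(step, name): value} dict into {step: {name: value}}.\"\"\"\n", + "    nested = defaultdict(dict)\n", + "    for (step, name), value in flat_hyperparameters.items():\n", + "        nested[step][name] = value\n", + "    return dict(nested)\n", + "\n", + "nested = unflatten(pipeline.get_hyperparameters(flat=True))"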
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 0},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.1,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate the pipeline performance to see what results these\n", + "hyperparameters produce." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8647586291610367" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting hyperparameter values\n", + "\n", + "Another way of setting the pipeline hyperparameters without having to recreate it\n", + "from scratch is to use its `set_hyperparameters` method.\n", + "\n", + "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n", + " 'max_labels': 10\n", + " },\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.3\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the hyperparameters can be set using the `flat` format:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can see how these hyperparameters are now different from before:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", + " 'copy': True,\n", + " 'features': 'auto',\n", + " 'max_unique_ratio': 0,\n", + " 'max_labels': 10},\n", + " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", + " 'fill_value': None,\n", + " 'verbose': False,\n", + " 'copy': True,\n", + " 'strategy': 'median'},\n", + " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", + " 'max_depth': 4,\n", + " 'n_estimators': 100,\n", + " 'learning_rate': 0.3,\n", + " 'gamma': 0,\n", + " 'min_child_weight': 1},\n", + " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Pipeline performance\n", + "\n", + "We can now evaluate the pipeline performance again and see how the hyperparameter\n", + "change affected it." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.870531875690947" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "dataset.score(y_test, y_pred)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb new file mode 100644 index 00000000..193daaf3 --- /dev/null +++ b/examples/tutorials/4. 
Saving and Loading a Pipeline.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving and Loading a Pipeline\n", + "\n", + "This short guide shows how to serialize a Pipeline into a file and later load it\n", + "to make predictions.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "We will:\n", + "\n", + "1. Load and fit a pipeline to a dataset.\n", + "2. Save the pipeline to a file.\n", + "3. Load the pipeline as a new object.\n", + "4. Make predictions using the new pipeline object." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit the pipeline\n", + "\n", + "The first step will be to load and fit the pipeline to the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = dataset.get_splits(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the Pipeline\n", + "\n", + "Once the pipeline is fit and ready to make predictions we can store it in a file.\n", + "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('pipeline.pkl', 'wb') as f:\n", + " pickle.dump(pipeline, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Pipeline\n", + "\n", + "The saved pipeline can then be moved to another system where we can load it back to\n", + "memory using pickle again." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pipeline.pkl', 'rb') as f:\n", + " loaded_pipeline = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT**: All the dependencies also need to be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives."
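+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "One possible way to keep track of those dependencies is to store the versions of the\n",
+ "key libraries next to the pickle. This is only a sketch of one approach, not an\n",
+ "MLBlocks feature:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "import mlblocks\n",
+ "import mlprimitives\n",
+ "\n",
+ "# Record the versions used to fit the pipeline, so the loading system\n",
+ "# can install matching dependencies before unpickling.\n",
+ "versions = {\n",
+ "    'mlblocks': mlblocks.__version__,\n",
+ "    'mlprimitives': mlprimitives.__version__,\n",
+ "}\n",
+ "with open('pipeline_requirements.json', 'w') as f:\n",
+ "    json.dump(versions, f)"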
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "Once the pipeline is loaded it is ready to make predictions again." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "pred = pipeline.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred[0:5]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/5. Tuning a Pipeline.ipynb new file mode 100644 index 00000000..8dbc4366 --- /dev/null +++ b/examples/tutorials/5. Tuning a Pipeline.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tuning a Pipeline\n", + "\n", + "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "1. Load a dataset and a pipeline.\n", + "2. Explore the pipeline tunable hyperparameters.\n", + "3. Write a scoring function.\n", + "4. Build a BTB Tunable and BTB Tuner.\n", + "5. Write a tuning loop." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset and the pipeline\n", + "\n", + "The first step will be to load the dataset that we were using in previous tutorials." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And load a suitable pipeline.\n", + "\n", + "Note how in this case we are using the variable name `template` instead of `pipeline`,\n", + "because this will only be used as a template for the pipelines that we will create\n", + "and evaluate during the later tuning loop." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the pipeline tunable hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n", + "by calling the `get_tunable_hyperparameters` method.\n", + "\n", + "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n", + "that is compatible with BTB."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunable_hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write a scoring function\n", + "\n", + "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n", + "\n", + "For this reason, we will start by writing a scoring function that will expect only one\n", + "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n", + "method from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(hyperparameters=None):\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By calling this function without any arguments we will obtain the score achieved\n", + "with the default hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8639171383183359" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_score = cross_validate()\n", + "default_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, we can verify that, by passing a hyperparameters dictionary, the new hyperparameters\n", + "will be used, resulting in a different score."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8686773872402614" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hyperparameters = {\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", + "}\n", + "cross_validate(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a BTB Tunable\n", + "\n", + "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n", + "\n", + "For this we will use its `from_dict` method, passing our hyperparameters dict." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.tuning import Tunable\n", + "\n", + "tunable = Tunable.from_dict(tunable_hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the BTB Tuner\n", + "\n", + "After creating the Tunable, we need to create a Tuner to tune it.\n", + "\n", + "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n", + "for the optimization." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.tuning import GPTuner\n", + "\n", + "tuner = GPTuner(tunable)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, since we already know the score obtained by the default arguments and\n", + "these have a high probability of being already decent, we will inform the tuner\n", + "about their performance.\n", + "\n", + "In order to obtain the default hyperparameters used before, we can either call\n", + "the template `get_hyperparameters(flat=True)` method or use `tunable.get_defaults()`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "defaults = tunable.get_defaults()\n", + "defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "tuner.record(defaults, default_score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the Tuning loop\n", + "\n", + "Once we have the tuner ready we can start the tuning loop.\n", + "\n", + "During this loop we will:\n", + "\n", + "1. Ask the tuner for a new hyperparameter proposal.\n", + "2. Run the `cross_validate` function to evaluate these hyperparameters.\n", + "3. Record the obtained score back to the tuner.\n", + "4. If the obtained score is better than the previous one, store the proposal."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scoring pipeline 1\n", + "scoring pipeline 2\n", + "New best found: 0.8722706212975673\n", + "scoring pipeline 3\n", + "scoring pipeline 4\n", + "scoring pipeline 5\n", + "scoring pipeline 6\n", + "scoring pipeline 7\n", + "scoring pipeline 8\n", + "scoring pipeline 9\n", + "scoring pipeline 10\n" + ] + } + ], + "source": [ + "best_score = default_score\n", + "best_proposal = defaults\n", + "\n", + "for iteration in range(10):\n", + " print(\"scoring pipeline {}\".format(iteration + 1))\n", + " \n", + " proposal = tuner.propose()\n", + " score = cross_validate(proposal)\n", + " \n", + " tuner.record(proposal, score)\n", + " \n", + " if score > best_score:\n", + " print(\"New best found: {}\".format(score))\n", + " best_score = score\n", + " best_proposal = proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n", + "which can be used to generate a new pipeline instance." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 40,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_proposal" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline = MLPipeline(template.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.set_hyperparameters(best_proposal)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "best_pipeline.fit(dataset.data, dataset.target)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb new file mode 100644 index 00000000..a1f0c0f4 --- /dev/null +++ b/examples/tutorials/6. 
Searching for the best pipeline with BTBSession.ipynb @@ -0,0 +1,895 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Selecting and Tuning Pipelines\n", + "\n", + "This guide shows you how to search for multiple pipelines for your problem\n", + "and later use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n", + "to select and tune the best one.\n", + "\n", + "Note that some steps are not explained for simplicity. Full details\n", + "about them can be found in the previous parts of the tutorial.\n", + "\n", + "Here we will:\n", + "\n", + "1. Load a dataset\n", + "2. Search and load suitable templates\n", + "3. Write a scoring function\n", + "4. Build a BTBSession for our templates\n", + "5. Run the session to find the best pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Dataset\n", + "\n", + "The first step will be to load the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from mlprimitives.datasets import load_dataset\n", + "\n", + "dataset = load_dataset('census')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adult Census dataset.\n", + "\n", + " Predict whether income exceeds $50K/yr based on census data. Also known as \"Adult\" dataset.\n", + "\n", + " Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean\n", + " records was extracted using the following conditions: ((AAGE>16) && (AGI>100) &&\n", + " (AFNLWGT>1)&& (HRSWK>0))\n", + "\n", + " Prediction task is to determine whether a person makes over 50K a year.\n", + "\n", + " source: \"UCI\n", + " sourceURI: \"https://archive.ics.uci.edu/ml/datasets/census+income\"\n", + " \n", + "Data Modality: single_table\n", + "Task Type: classification\n", + "Task Subtype: binary\n", + "Data shape: (32561, 14)\n", + "Target shape: (32561,)\n", + "Metric: accuracy_score\n", + "Extras: \n" + ] + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find and load suitable Templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `mlblocks.discovery.find_pipelines` function to search\n", + "for compatible pipelines.\n", + "\n", + "In this case, we will be looking for `single_table/classification` pipelines."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks.discovery import find_pipelines\n", + "\n", + "filters = {\n", + " 'metadata.data_modality': 'single_table',\n", + " 'metadata.task_type': 'classification'\n", + "}\n", + "templates = find_pipelines(filters=filters)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['single_table.classification.categorical_encoder.logit',\n", + " 'single_table.classification.categorical_encoder.random_forest',\n", + " 'single_table.classification.categorical_encoder.xgboost',\n", + " 'single_table.classification.mlprimitives.logit',\n", + " 'single_table.classification.mlprimitives.random_forest',\n", + " 'single_table.classification.mlprimitives.xgboost',\n", + " 'single_table.classification.mlprimitives_text.xgboost']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we will create a dictionary with MLPipeline instances that will be used as templates for our tuning." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from mlblocks import MLPipeline\n", + "\n", + "templates_dict = {\n", + " template: MLPipeline(template)\n", + " for template in templates\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<mlblocks.mlpipeline.MLPipeline object at 0x...>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "templates_dict['single_table.classification.mlprimitives.xgboost']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a scoring function\n", + "\n", + "In order to use a `BTBSession` we will need a function that is able to score a proposal,\n", + "which will always be a pair of template name and proposed hyperparameters.\n", + "\n", + "In this case, the evaluation will be done using 5-fold cross validation over our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def cross_validate(template_name, hyperparameters=None):\n", + " template = templates_dict[template_name]\n", + " scores = []\n", + " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", + " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", + " if hyperparameters:\n", + " pipeline.set_hyperparameters(hyperparameters)\n", + "\n", + " pipeline.fit(X_train, y_train)\n", + " y_pred = pipeline.predict(X_test)\n", + " \n", + " scores.append(dataset.score(y_test, y_pred))\n", + " \n", + " return np.mean(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup the BTBSession" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will create another dictionary with the tunable hyperparameters of each template.\n", + "This will be used by the BTBSession to know how to tune each template."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "tunables = {\n", + " name: template.get_tunable_hyperparameters(flat=True)\n", + " for name, template in templates_dict.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): {'type': 'bool', 'default': True},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): {'type': 'bool', 'default': True},\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", + " 'default': 'mean',\n", + " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", + " 'default': 100,\n", + " 'range': [10, 1000]},\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", + " 'default': 3,\n", + " 'range': [3, 10]},\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", + " 'default': 0.1,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", + " 'default': 0,\n", + " 'range': [0, 1]},\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", + " 'default': 1,\n", + " 'range': [1, 10]}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tunables['single_table.classification.mlprimitives.xgboost']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then create a `BTBSession` instance passing them and the `cross_validate` function.\n", + "\n", + "We will also be setting it in `verbose` mode, so we can have better insight into what is going on." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from btb.session import BTBSession\n", + "\n", + "session = BTBSession(tunables, cross_validate, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the Session" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After everything is set up, we can start running the tuning session passing it\n", + "the number of iterations that we want to perform."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n", + "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n", + "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", + " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.run(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "During this loop, the BTBSession will build pipelines based on our templates and evaluate them\n", + "using our scoring function." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate the Results\n", + "\n", + "When the session finishes running it will return the best proposal available and the\n", + "obtained score.\n", + "\n", + "These results are also available as the `best_proposal` attribute of the `BTBSession` object."
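+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch, the winning proposal can be turned back into a fitted pipeline\n",
+ "by using its `name` and `config` entries, which we saw in the result above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_proposal = session.best_proposal\n",
+ "\n",
+ "# Build a new pipeline from the winning template and apply the tuned\n",
+ "# hyperparameters before fitting it on the full dataset.\n",
+ "best_pipeline = MLPipeline(best_proposal['name'])\n",
+ "best_pipeline.set_hyperparameters(best_proposal['config'])\n",
+ "best_pipeline.fit(dataset.data, dataset.target)"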
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", + " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.best_proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Continue Running" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we feel that the score can still be improved and want to keep searching, we can simply run the session again which will continue tuning over the previous results." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a76ce44e1173496e99baaf7ee39a3df7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n", + "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n", + "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " 
File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n", + " self._produce_block(block, block_name, context, output_variables, outputs)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n", + "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n", + "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n", + "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n", + 
"('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", + "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", + "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n", + "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", + "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n", + "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n", + "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", + " self._fit_block(block, block_name, context)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. 
Removing tunable single_table.classification.categorical_encoder.logit\n", + "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n", + "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n", + "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", + "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", + "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n", + "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", + "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n", + "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n", + "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", + " score = self.scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", + " self._fit_block(block, block_name, context)\n", + " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", + " block.fit(**fit_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", + " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", + " solver = _check_solver(self.solver, self.penalty, self.dual)\n", + " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", + " \"got %s penalty.\" % (solver, penalty))\n", + "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", + "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n", + "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. \"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", + " warn(\"Some inputs do not have OOB scores. 
\"\n", + "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", + " predictions[k].sum(axis=1)[:, np.newaxis])\n", + "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n", + "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", + "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", + "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", + "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", + "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", + " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 22,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 3863,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", + " 'score': 0.8739290413691612}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.run(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: If you look at the logs you will notice how the BTBSession captures the errors that finds\n", + "while executing the pipelines and automatically discards the failing tempaltes to be able to continue\n", + "the tuning session without wasting time on them.\n", + "\n", + "The number of errors that we want to wait before discarding a template can be changed passing the\n", + "`max_errors` argument to the `BTBSession` when it is build.\n", + "\n", + "Isn't it cool?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the best pipeline\n", + "\n", + "Once we are satisfied with the results, we can then build an instance of the best pipeline\n", + "by reading the `best_proposal` attribute from the `session`." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': 'd9854a57d48100da0f3584dc4490301f',\n",
+       " 'name': 'single_table.classification.mlprimitives.xgboost',\n",
+       " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+       "   'max_labels'): 22,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'lowercase'): True,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'binary'): True,\n",
+       "  ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n",
+       "   'max_features'): 3863,\n",
+       "  ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n",
+       "  ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n",
+       "  ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n",
+       "  ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n",
+       "  ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n",
+       "  ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n",
+       " 'score': 0.8739290413691612}"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "best_proposal = session.best_proposal\n",
+    "best_proposal"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template = templates_dict[best_proposal['name']]\n",
+    "\n",
+    "pipeline = MLPipeline(template.to_dict())\n",
+    "pipeline.set_hyperparameters(best_proposal['config'])\n",
+    "\n",
+    "pipeline.fit(dataset.data, dataset.target)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore other results\n",
+    "\n",
+    "Optionally, if we are interested in exploring the results of the previous proposals, we can access them\n",
+    "in the `proposals` attribute of the `session` object.\n",
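+    "\n",
+    "As a quick sketch of how these could be explored further (assuming, as the outputs below suggest,\n",
+    "that each proposal is a dict and that crashed proposals keep a `score` of `None`), we could rank\n",
+    "the scored proposals:\n",
+    "\n",
+    "```python\n",
+    "scored = [p for p in session.proposals.values() if p.get('score') is not None]\n",
+    "ranking = sorted(scored, key=lambda p: p['score'], reverse=True)\n",
+    "```"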
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n", + " 'name': 'single_table.classification.categorical_encoder.logit',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n", + " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n", + " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n", + " 'score': 0.7975185708718643},\n", + " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n", + " 'name': 'single_table.classification.categorical_encoder.random_forest',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 0,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1,\n", + " ('sklearn.ensemble.RandomForestClassifier#1',\n", + " 'min_weight_fraction_leaf'): 0.0,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n", + " ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n", + " 'score': 0.7591904454179904}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(session.proposals.values())[0:2]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index 1e8ef2ad..6045c574 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,12 @@ ] +examples_require = [ + 'mlprimitives>=0.2.4.dev0', + 'jupyter==1.0.0' +] + + tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', @@ -88,6 +94,7 @@ extras_require={ 'dev': development_requires + tests_require, 'test': tests_require, + 'examples': examples_require, }, include_package_data=True, install_requires=install_requires, From c2f862b55ec52e6b7c431fe741bd83f7366b6a09 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 15:20:27 -0500 Subject: [PATCH 084/160] Update tutorials --- 
.gitignore                                    |   1 +
 examples/README.md                            |  57 ++
 .../2. Finding and Loading a Pipeline.ipynb   |   6 +-
 .... Setting MLPipeline Hyperparameters.ipynb |   9 +-
 ...Saving and Loading a Pipeline-Copy1.ipynb} |   9 +-
 ...ial execution and pipeline debugging.ipynb | 712 ++++++++++++++++++
 .../6. Flexible outputs specification.ipynb   | 517 +++++++++++++
 ...eline.ipynb => 7. Tuning a Pipeline.ipynb} |   0
 ...r the best pipeline with BTBSession.ipynb} |   0
 9 files changed, 1306 insertions(+), 5 deletions(-)
 create mode 100644 examples/README.md
 rename examples/tutorials/{4. Saving and Loading a Pipeline.ipynb => 4. Saving and Loading a Pipeline-Copy1.ipynb} (91%)
 create mode 100644 examples/tutorials/5. Partial execution and pipeline debugging.ipynb
 create mode 100644 examples/tutorials/6. Flexible outputs specification.ipynb
 rename examples/tutorials/{5. Tuning a Pipeline.ipynb => 7. Tuning a Pipeline.ipynb} (100%)
 rename examples/tutorials/{6. Searching for the best pipeline with BTBSession.ipynb => 8. Searching for the best pipeline with BTBSession.ipynb} (100%)

diff --git a/.gitignore b/.gitignore
index 011ff452..037d677e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,4 @@ ENV/
 .*.swp

 mlblocks/data
+examples/tutorials/pipeline.pkl
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..12131c95
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,57 @@
+# MLBlocks Examples
+
+This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks
+functionality.
+
+Within this folder you will find:
+
+* `examples.py`: Simple Python code examples of a class and a function based primitive implementation.
+* `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
+* `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
+* `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
+
+
+# Requirements
+
+In order to run the examples contained in this folder you should have [pip installed on your system
+](https://pip.pypa.io/en/stable/installing/).
+
+Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to
+run them in an isolated environment.
+
+# Usage
+
+In order to run these tutorials on your computer, please follow these steps:
+
+1. Clone this GitHub repository:
+
+```bash
+git clone git@github.com:HDI-Project/MLBlocks.git
+```
+
+2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the
+rest of your computer:
+
+```bash
+pip install virtualenv
+virtualenv -p $(which python3.6) mlblocks-venv
+source mlblocks-venv/bin/activate
+```
+
+3. Enter the repository and install the dependencies:
+
+```bash
+cd MLBlocks
+make install-examples
+```
+
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
+
+4. Enter the `examples` folder and start a Jupyter Notebook:
+
+```bash
+jupyter notebook
+```
+
+5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder.
diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
index a94c48bc..8df76259 100644
--- a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
+++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -63,7 +63,7 @@
      " 'image.classification.resnet50.xgboost']"
      ]
     },
-    "execution_count": 8,
+    "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
index 29f60a8f..0914e806 100644
--- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
+++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb
@@ -51,7 +51,14 @@
    "source": [
     "from mlblocks import MLPipeline\n",
     "\n",
-    "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
    ]
   },
   {
diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
similarity index 91%
rename from examples/tutorials/4. Saving and Loading a Pipeline.ipynb
rename to examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
index 193daaf3..f8a0a5b3 100644
--- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb
+++ b/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb
@@ -57,7 +57,14 @@
    "source": [
     "from mlblocks import MLPipeline\n",
     "\n",
-    "pipeline = MLPipeline('single_table.classification.categorical_encoder.xgboost')"
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
    ]
   },
   {
diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
new file mode 100644
index 00000000..2e21c85b
--- /dev/null
+++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb
@@ -0,0 +1,712 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Partial execution and pipeline debugging\n",
+    "\n",
+    "In this guide we will show you how to execute a pipeline partially in order to\n",
+    "debug its internal behavior or optimize tuning processes.\n",
+    "\n",
+    "Note that some steps are not explained for simplicity. Full details\n",
+    "about them can be found in the previous parts of the tutorial.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a pipeline and a dataset\n",
+    "2. Explore the context after fitting the first primitive\n",
+    "3. Fit the rest of the pipeline\n",
+    "4. Partial execution during Predict\n",
+    "5. Rerunning the last steps"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load a pipeline and a dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a reminder, let's have a look at what the `X` and `y` variables that we will be passing to our\n",
+    "pipeline look like."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<!-- pandas HTML rendering of X_train.head() omitted; see the text/plain output below -->"
+      ],
+      "text/plain": [
+       "       age    workclass  fnlwgt   education  education-num  \\\n",
+       "28291   25      Private  193379  Assoc-acdm             12   \n",
+       "28636   55  Federal-gov  176904     HS-grad              9   \n",
+       "7919    30      Private  284395     HS-grad              9   \n",
+       "24861   17      Private  239346        10th              6   \n",
+       "23480   51      Private   57698     HS-grad              9   \n",
+       "\n",
+       "              marital-status       occupation   relationship   race  \\\n",
+       "28291          Never-married     Craft-repair  Not-in-family  White   \n",
+       "28636     Married-civ-spouse  Exec-managerial        Husband  White   \n",
+       "7919      Married-civ-spouse     Craft-repair        Husband  White   \n",
+       "24861          Never-married    Other-service      Own-child  White   \n",
+       "23480  Married-spouse-absent    Other-service      Unmarried  White   \n",
+       "\n",
+       "          sex  capital-gain  capital-loss  hours-per-week native-country  \n",
+       "28291    Male             0             0              45  United-States  \n",
+       "28636    Male             0             0              40  United-States  \n",
+       "7919     Male             0             0              50  United-States  \n",
+       "24861    Male             0             0              18  United-States  \n",
+       "23480  Female             0             0              40  United-States  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+    "above or under 50K."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_train[0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And we build a suitable pipeline for our dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlblocks import MLPipeline\n",
+    "\n",
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore the context after fitting the first primitive\n",
+    "\n",
+    "Once we know what primitives we are executing, we will execute only the first one\n",
+    "and see how the context changed after it.\n",
+    "\n",
+    "For this, we will execute the `fit` method passing the index of the last pipeline\n",
+    "step that we want to execute before returning. In this case, `0`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fit_context = pipeline.fit(X_train, y_train, output_=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**NOTE**: Optionally, instead of passing the pipeline step index, we could pass the complete name\n",
+    "of the step, including the counter number: `mlprimitives.custom.preprocessing.ClassEncoder#1`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_step = 'mlprimitives.custom.preprocessing.ClassEncoder#1'\n",
+    "fit_context = pipeline.fit(X_train, y_train, output_=output_step)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In both cases, the output will be a dictionary containing all the context variables after\n",
+    "fitting and producing the first pipeline step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['X', 'y', 'classes'])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context.keys()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notice how we find the `X` and `y` variables that we passed to the `fit` method, but also a new `classes` variable\n",
+    "that was generated by the `mlprimitives.custom.preprocessing.ClassEncoder` primitive of the first pipeline step.\n",
+    "\n",
+    "This `classes` variable contains the list of unique values that the variable `y` originally had."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' >50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context['classes']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also notice that the variable `y` has been transformed by the primitive into an array of\n",
+    "integer values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 0, 0, 0, 0])"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_context['y'][0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fit the rest of the pipeline\n",
+    "\n",
+    "After exploring the context generated by the first pipeline step we will now run\n",
+    "a few more steps, up to the point where the feature matrix is ready for the XGBClassifier.\n",
+    "\n",
+    "For this, we will run the `fit` method again, passing back the context that we just obtained,\n",
+    "as well as the `start_` argument indicating that we need to start fitting on the second\n",
+    "step of the pipeline, skipping the first one, and the `output_` argument indicating that\n",
+    "we want to stop on the third step, right before the `XGBClassifier` primitive.\n",
+    "\n",
+    "Note how the context is passed using a double asterisk `**` syntax, but that individual\n",
+    "variables could also be passed as keyword arguments.\n",
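+    "\n",
+    "For example, here is a sketch (assuming the `X`, `y` and `classes` context keys shown above) of\n",
+    "the same call written with explicit keyword arguments instead of the `**` expansion:\n",
+    "\n",
+    "```python\n",
+    "fit_context = pipeline.fit(\n",
+    "    X=fit_context['X'],\n",
+    "    y=fit_context['y'],\n",
+    "    classes=fit_context['classes'],\n",
+    "    start_=1,\n",
+    "    output_=2,\n",
+    ")\n",
+    "```"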
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the context still contains the same variables as before" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['classes', 'X', 'y'])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But the variable `X` has been completely modified by the CategoricalEncoder and Imputer\n", + "primitives, so now it is a 100% numerical `numpy.ndarray` ready for the `XGBClassifier`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2.50000e+01, 1.93379e+05, 1.20000e+01, 0.00000e+00, 0.00000e+00,\n", + " 4.50000e+01, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,\n", + " 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,\n", + " 0.00000e+00, 0.00000e+00, 0.00000e+00])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_context['X'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can pass the new context to the rest of the pipeline to finish fitting it.\n", + "\n", + "Note how, just like the `output_`, the `start_` step can also be indicated using the step\n", + "name instead of the index." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_='xgboost.XGBClassifier#1', **fit_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Partial execution during Predict\n", + "\n", + "Just like in the `fit` stage, the `predict` method also accepts a partial output specification." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "predict_context = pipeline.predict(X_test, output_=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['X', 'y'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_context.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As well as a partial execution after a specific pipeline step" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rerunning the last steps\n", + "\n", + "One of the key advantages of the partial execution that we just explored is the\n", + "possibility to re-fit and make new predictions multiple times with different\n", + "hyperparameter values for the last half of the pipeline without the need to\n", + "re-fit and re-execute the first half.\n", + "\n", + "This has the potential to greatly accelerate tuning processes in cases where there\n", + "are no tunable hyperparameters (or there are but we do not want to tune them) in\n", + "the preprocessing steps but the execution times are long.\n", + "\n", + "As an example, let's evaluate the performance of the pipeline and try to optimize\n", + "it by changing some hyperparameters of the classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602137329566393" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " 'xgboost.XGBClassifier#1': {\n", + " 'learning_rate': 0.5\n", + " }\n", + "}\n", + "pipeline.set_hyperparameters(hyperparameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(start_=3, **fit_context)\n", + "predictions = pipeline.predict(start_=3, **predict_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.872251566146665" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.score(y_test, predictions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. 
Flexible outputs specification.ipynb
new file mode 100644
index 00000000..3dc3686f
--- /dev/null
+++ b/examples/tutorials/6. Flexible outputs specification.ipynb
@@ -0,0 +1,517 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Flexible outputs specification\n",
+    "\n",
+    "In a previous tutorial we have learnt how to obtain intermediate pipeline\n",
+    "outputs in order to debug its internal behavior.\n",
+    "\n",
+    "In this guide we will go a bit further and learn how to define flexible outputs\n",
+    "for the pipeline in order to obtain the output of multiple primitives\n",
+    "at once.\n",
+    "\n",
+    "Note that some steps are not explained for simplicity. Full details\n",
+    "about them can be found in the previous parts of the tutorial.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Load a pipeline and a dataset\n",
+    "2. Explore the output specification formats"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load a pipeline and a dataset\n",
+    "\n",
+    "The first step will be to load the Census dataset and the pipeline that we will be using."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlprimitives.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('census')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = dataset.get_splits(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlblocks import MLPipeline\n",
+    "\n",
+    "primitives = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder',\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+    "    'sklearn.impute.SimpleImputer',\n",
+    "    'xgboost.XGBClassifier',\n",
+    "    'mlprimitives.custom.preprocessing.ClassDecoder'\n",
+    "]\n",
+    "pipeline = MLPipeline(primitives)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also, just as a reminder, let's have a quick look at the steps of this pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['mlprimitives.custom.preprocessing.ClassEncoder',\n",
+       " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n",
+       " 'sklearn.impute.SimpleImputer',\n",
+       " 'xgboost.XGBClassifier',\n",
+       " 'mlprimitives.custom.preprocessing.ClassDecoder']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipeline.primitives"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And at the `X` and `y` variables that we will be passing to our pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`X` is a `pandas.DataFrame` that contains the demographics data of the subjects:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<!-- pandas HTML rendering of X_train.head() omitted; see the text/plain output below -->"
+      ],
+      "text/plain": [
+       "       age    workclass  fnlwgt   education  education-num  \\\n",
+       "28291   25      Private  193379  Assoc-acdm             12   \n",
+       "28636   55  Federal-gov  176904     HS-grad              9   \n",
+       "7919    30      Private  284395     HS-grad              9   \n",
+       "24861   17      Private  239346        10th              6   \n",
+       "23480   51      Private   57698     HS-grad              9   \n",
+       "\n",
+       "              marital-status       occupation   relationship   race  \\\n",
+       "28291          Never-married     Craft-repair  Not-in-family  White   \n",
+       "28636     Married-civ-spouse  Exec-managerial        Husband  White   \n",
+       "7919      Married-civ-spouse     Craft-repair        Husband  White   \n",
+       "24861          Never-married    Other-service      Own-child  White   \n",
+       "23480  Married-spouse-absent    Other-service      Unmarried  White   \n",
+       "\n",
+       "          sex  capital-gain  capital-loss  hours-per-week native-country  \n",
+       "28291    Male             0             0              45  United-States  \n",
+       "28636    Male             0             0              40  United-States  \n",
+       "7919     Male             0             0              50  United-States  \n",
+       "24861    Male             0             0              18  United-States  \n",
+       "23480  Female             0             0              40  United-States  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And `y` is a `numpy.ndarray` that contains the label that indicates whether the subject has a salary\n",
+    "above or under 50K."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_train[0:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore the output specification formats\n",
+    "\n",
+    "In the previous tutorial we learnt that the output of a pipeline can be specified\n",
+    "in multiple formats:\n",
+    "\n",
+    "* An integer indicating the pipeline step index, which will return us the complete\n",
+    "  context after producing the corresponding step.\n",
+    "* A string indicating the name of a step, which will also return us the complete\n",
+    "  context after producing the corresponding step.\n",
+    "\n",
+    "Apart from these two options, there are a few more.\n",
+    "\n",
+    "### Single variable specification\n",
+    "\n",
+    "Variables can be individually specified by passing a string in the format\n",
+    "`{pipeline-step-name}.{variable-name}`.\n",
+    "\n",
+    "Note that the `pipeline-step-name` part is not only the primitive name, but\n",
+    "also the counter number at the end of it.\n",
+    "\n",
+    "For example, if we want to explore the `classes` variable generated by\n",
+    "the `ClassEncoder` primitive during `fit`, we can do the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([' <=50K', ' >50K'], dtype=object)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_spec = 'mlprimitives.custom.preprocessing.ClassEncoder#1.classes'\n",
+    "pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**NOTE**: Just like with the full context specification, when a variable is specified\n",
+    "the pipeline will be executed only up to the step that produces the indicated variable.\n",
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List of variables\n",
+    "\n",
+    "In some cases we will be interested in obtaining more than one variable\n",
+    "at a time.\n",
+    "\n",
+    "In order to do this, instead of a single string specification we can pass\n",
+    "a list of strings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "out = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output will be a `tuple` containing the variables in the specified order."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y, classes = out"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we want to obtain variables from multiple pipeline steps, we simply need\n",
+    "to specify all of them at once. Again, **MLBlocks** will run all the necessary\n",
+    "pipeline steps, accumulating the desired variables up to the last step needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'sklearn.impute.SimpleImputer#1.X',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "X, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If required, we can even capture the same variable across the different pipeline steps!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_spec = [\n",
+    "    'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n",
+    "    'sklearn.impute.SimpleImputer#1.X',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.y',\n",
+    "    'mlprimitives.custom.preprocessing.ClassEncoder#1.classes',\n",
+    "]\n",
+    "X_1, X_2, y, classes = pipeline.fit(X_train, y_train, output_=output_spec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(24420, 108)"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(24420, 108)"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_2.shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/tutorials/5. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
similarity index 100%
rename from examples/tutorials/5. Tuning a Pipeline.ipynb
rename to examples/tutorials/7. Tuning a Pipeline.ipynb
diff --git a/examples/tutorials/6.
Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb similarity index 100% rename from examples/tutorials/6. Searching for the best pipeline with BTBSession.ipynb rename to examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb From 03c7a2d07d15f6e69e448e72860fc4b18ad60ac9 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:30:57 -0500 Subject: [PATCH 085/160] Simplify README and make docs based on it --- README.md | 190 +++++----------------------- docs/conf.py | 4 +- docs/getting_started/install.rst | 57 --------- docs/getting_started/quickstart.rst | 125 ------------------ docs/index.rst | 54 +------- docs/readme.rst | 1 + 6 files changed, 37 insertions(+), 394 deletions(-) delete mode 100644 docs/getting_started/install.rst delete mode 100644 docs/getting_started/quickstart.rst create mode 100644 docs/readme.rst diff --git a/README.md b/README.md index 7c152fa3..f3a6e3d7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

-“MLBlocksr” +“DAI-Lab” An open source project from Data to AI Lab at MIT.

@@ -16,12 +16,12 @@ Pipelines and Primitives for Machine Learning and Data Science. [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) +# Overview + * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks * Homepage: https://github.com/HDI-Project/MLBlocks -# MLBlocks - MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. @@ -47,25 +47,10 @@ Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering with other software installed in the system where **MLBlocks** is run. -These are the minimum commands needed to create a virtualenv using python3.6 for **MLBlocks**: - -```bash -pip install virtualenv -virtualenv -p $(which python3.6) mlblocks-venv -``` - -Afterwards, you have to execute this command to have the virtualenv activated: - -```bash -source mlblocks-venv/bin/activate -``` - -Remember about executing it every time you start a new console to work on **MLBlocks**! - ## Install with pip -After creating the virtualenv and activating it, we recommend using -[pip](https://pip.pypa.io/en/stable/) in order to install **MLBlocks**: +The easiest and recommended way to install **MLBlocks** is using [pip]( +https://pip.pypa.io/en/stable/): ```bash pip install mlblocks @@ -73,46 +58,8 @@ pip install mlblocks This will pull and install the latest stable release from [PyPi](https://pypi.org/). -## Install from source - -Alternatively, with your virtualenv activated, you can clone the repository and install it from -source by running `make install` on the `stable` branch: - -```bash -git clone git@github.com:HDI-Project/MLBlocks.git -cd MLBlocks -git checkout stable -make install -``` - -## Install for Development - -If you want to contribute to the project, a few more steps are required to make the project ready -for development. - -First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLBlocks) -and make a fork of the project under you own username by clicking on the **fork** button on the -upper right corner of the page. - -Afterwards, clone your fork and create a branch from master with a descriptive name that includes -the number of the issue that you are going to work on: - -```bash -git clone git@github.com:{your username}/MLBlocks.git -cd MLBlocks -git branch issue-xx-cool-new-feature master -git checkout issue-xx-cool-new-feature -``` - -Finally, install the project with the following command, which will install some additional -dependencies for code linting and testing. - -```bash -make install-develop -``` - -Make sure to use them regularly while developing by running the commands `make lint` and `make test`. - +If you want to install from source or contribute to the project please read the +[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started). ## MLPrimitives @@ -128,118 +75,43 @@ pip install mlprimitives # Quickstart -Below there is a short example about how to use MLBlocks to create a simple pipeline, fit it -using demo data and use it to make predictions. 
+Below there is a short example about how to use **MLBlocks** to solve a prediction problem +using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives). -Please make sure to also having installed [MLPrimitives](https://github.com/HDI-Project/MLPrimitives) -before following it. +```python3 +from mlblocks import MLPipeline +from mlprimitives.datasets import load_dataset -For advance usage and more detailed explanation about each component, please have a look -at the [documentation](https://HDI-Project.github.io/MLBlocks) +dataset = load_dataset('census') +X_train, X_test, y_train, y_test = dataset.get_splits(1) -## Creating a pipeline +primitives = [ + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' +] +pipeline = MLPipeline(primitives) -With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing -them to the `MLPipeline` class. +pipeline.fit(X_train, y_train) +predictions = pipeline.predict(X_test) -```python ->>> from mlblocks import MLPipeline -... primitives = [ -... 'cv2.GaussianBlur', -... 'skimage.feature.hog', -... 'sklearn.ensemble.RandomForestClassifier' -... ] ->>> pipeline = MLPipeline(primitives) -``` - -Optionally, specific initialization arguments can be also set by specifying them in a dictionary: - -```python ->>> init_params = { -... 'skimage.feature.hog': { -... 'multichannel': True, -... 'visualize': False -... }, -... 'sklearn.ensemble.RandomForestClassifier': { -... 'n_estimators': 100, -... } -... } ->>> pipeline = MLPipeline(primitives, init_params=init_params) -``` - -If you can see which hyperparameters a particular pipeline is using, you can do so by calling -its `get_hyperparameters` method: - -```python ->>> import json ->>> hyperparameters = pipeline.get_hyperparameters() ->>> print(json.dumps(hyperparameters, indent=4)) -{ - "cv2.GaussianBlur#1": { - "ksize_width": 3, - "ksize_height": 3, - "sigma_x": 0, - "sigma_y": 0 - }, - "skimage.feature.hog#1": { - "multichannel": true, - "visualize": false, - "orientations": 9, - "pixels_per_cell_x": 8, - "pixels_per_cell_y": 8, - "cells_per_block_x": 3, - "cells_per_block_y": 3, - "block_norm": null - }, - "sklearn.ensemble.RandomForestClassifier#1": { - "n_jobs": -1, - "n_estimators": 100, - "criterion": "entropy", - "max_features": null, - "max_depth": 10, - "min_samples_split": 0.1, - "min_samples_leaf": 0.1, - "class_weight": null - } -} -``` - -## Making predictions - -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. - -To do this, we first call the `fit` method passing the training data and the corresponding labels. - -In this case in particular, we will be loading the handwritten digit classification dataset -from USPS using the `mlblocks.datasets.load_usps` method, which returns a dataset object -ready to be played with. - -```python ->>> from mlblocks.datasets import load_usps ->>> dataset = load_usps() ->>> X_train, X_test, y_train, y_test = dataset.get_splits(1) ->>> pipeline.fit(X_train, y_train) -``` - -Once we have fitted our model to our data, we can call the `predict` method passing new data -to obtain predictions from the pipeline. 
- -```python ->>> predictions = pipeline.predict(X_test) ->>> predictions -array([3, 2, 1, ..., 1, 1, 2]) +dataset.score(y_test, predictions) ``` # What's Next? If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please -check our [documentation](https://HDI-Project.github.io/MLBlocks). +check our [documentation site](https://HDI-Project.github.io/MLBlocks). + +Also do not forget to have a look at the [notebook tutorials]( +https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)! -## Citing MLBlocks +# Citing MLBlocks -If you use MLBlocks, please consider citing our related papers. +If you use MLBlocks for your research, please consider citing our related papers. For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at the MIT Data To AI Lab, please see: diff --git a/docs/conf.py b/docs/conf.py index 95653914..5ff266d0 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,6 @@ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' - # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ @@ -47,7 +46,6 @@ } ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] - # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -116,7 +114,7 @@ # documentation. html_theme_options = { 'collapse_navigation': False, - 'display_version': False, + 'display_version': True, } # Add any paths that contain custom static files (such as style sheets) here, diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst deleted file mode 100644 index 4163f3bd..00000000 --- a/docs/getting_started/install.rst +++ /dev/null @@ -1,57 +0,0 @@ -.. highlight:: shell - -Installation -============ - -From PyPi ---------- - -The simplest and recommended way to install MLBlocks is using `pip`: - -.. code-block:: console - - pip install mlblocks - -If you don't have `pip`_ installed, this `Python installation guide`_ can guide -you through the process. - -.. _pip: https://pip.pypa.io -.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - -From sources ------------- - -The sources for MLBlocks can be downloaded from the `Github repo`_. - -You can either clone the public repository: - -.. code-block:: console - - git clone git://github.com/HDI-Project/MLBlocks - -Or download the `tarball`_: - -.. code-block:: console - - curl -OL https://github.com/HDI-Project/MLBlocks/tarball/master - -Once you have a copy of the source, you can install it running the next command inside the -project folder: - -.. code-block:: console - - $ make install - -.. _Github repo: https://github.com/HDI-Project/MLBlocks -.. _tarball: https://github.com/HDI-Project/MLBlocks/tarball/master - -Development ------------ - -If you are installing **MLBlocks** in order to modify its code, the installation must be done -from its sources, in the editable mode, and also including some additional dependencies in -order to be able to run the tests and build the documentation: - -.. 
code-block:: console - - make install-develop diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst deleted file mode 100644 index 31be89ee..00000000 --- a/docs/getting_started/quickstart.rst +++ /dev/null @@ -1,125 +0,0 @@ -Quickstart -========== - -Below is a short tutorial that will show you how to get started using **MLBlocks**. - -In this tutorial we will learn how to: - -* Create a pipeline using multiple primitives -* Obtain the list of tunable hyperparameters from the pipeline -* Specify hyperparameters for each primitive in the pipeline -* Fit the pipeline using training data -* Use the pipeline to make predictions from new data - -.. note:: Some additional dependencies are required in order to run this Quickstart. - Make sure that `you have already installed them`_. - -Creating a pipeline -------------------- - -With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing -them to the `MLPipeline class`_: - -.. ipython:: python - - from mlblocks import MLPipeline - primitives = [ - 'mlprimitives.custom.feature_extraction.CategoricalEncoder', - 'mlprimitives.custom.feature_extraction.StringVectorizer', - 'sklearn.ensemble.RandomForestClassifier', - ] - pipeline = MLPipeline(primitives) - -Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and -passing them as the ``init_params`` argument: - -.. ipython:: python - - init_params = { - 'sklearn.ensemble.RandomForestClassifier': { - 'n_estimators': 100 - } - } - pipeline = MLPipeline(primitives, init_params=init_params) - -Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set -for each block, by calling the `get_hyperparameters method`_. - -The output of this method is a dictionary which has the name of each block as keys and -a dictionary with the `hyperparameters`_ of the corresponding block as values. - -.. ipython:: python - - pipeline.get_hyperparameters() - -Tunable Hyperparameters ------------------------ - -One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate -the type and possible values that each primitive hyperparameter accepts. - -The list of possible hyperparameters and their details can easily be obtained from the pipeline -instance by calling its `get_tunable_hyperparameters method`_. - -The output of this method is a dictionary that contains the list of tunable hyperparameters -for each block in the pipeline, ready to be passed to any hyperparameter tuning library such -as `BTB`_. - -.. ipython:: python - - pipeline.get_tunable_hyperparameters() - -Setting Hyperparameters ------------------------ - -Modifying the hyperparameters of an already instantiated pipeline can be done using the -`set_hyperparameters method`_, which expects a dictionary with the same format as the returned -by the `get_hyperparameters method`_. - -Note that if a subset of the hyperparameters is passed, only these will be modified, and the -other ones will remain unmodified. - -.. ipython:: python - - new_hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier#1': { - 'max_depth': 15 - } - } - pipeline.set_hyperparameters(new_hyperparameters) - hyperparameters = pipeline.get_hyperparameters() - hyperparameters['sklearn.ensemble.RandomForestClassifier#1']['max_depth'] - -Making predictions ------------------- - -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. 
- -To do this, we first call the ``fit`` method passing the training data and the corresponding -labels. - -.. ipython:: python - - from mlblocks.datasets import load_personae - dataset = load_personae() - X_train, X_test, y_train, y_test = dataset.get_splits(1) - pipeline.fit(X_train, y_train) - -Once we have fitted our model to our data, we can call the ``predict`` method passing new data -to obtain predictions from the pipeline. - -.. ipython:: python - - predictions = pipeline.predict(X_test) - predictions - dataset.score(y_test, predictions) - -.. _you have already installed them: install.html#additional-dependencies -.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline -.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters -.. _hyperparameters: ../advanced_usage/hyperparameters.html -.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations -.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters -.. _BTB: https://github.com/HDI-Project/BTB -.. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters diff --git a/docs/index.rst b/docs/index.rst index c3655b3c..7a6fa800 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,53 +1,10 @@ -What is MLBlocks? -================= - -.. image:: images/mlblocks-logo.png - :width: 300 px - :alt: MLBlocks - :align: center - -MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning -tools developed in Python, whether they are custom developments or belong to third party -libraries, and build Pipelines out of them that can be fitted and then used to make predictions. - -This is achieved by providing a simple and intuitive annotation language that allows the -user to specify how to integrate with each tool, here called primitives, in order to provide -a common uniform interface to each one of them. - -At a high level: - -* Each available primitive has been annotated using a standardized JSON file that specifies its - native interface, as well as which hyperparameters can be used to tune its behavior. -* A list of primitives that will be combined into a pipeline is provided by the user, optionally - passing along the hyperparameters to use for each primitive. -* An MLBlock instance is build for each primitive, offering a common interface for all of them. -* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in - the right order, passing the output from each one as input to the next one. -* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each - MLBlock instance following the JSON annotation specification. -* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each - MLBlock sequentially to obtain the desired predictions. - -History -------- - -In its first iteration in 2015, MLBlocks was designed for only multi table, multi entity temporal -data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis: - -* `Machine learning blocks`_. - Bryan Collazo. Masters thesis, MIT EECS, 2015. - -With recent availability of a multitude of libraries and tools, we decided it was time to integrate -them and expand the library to address other data types: images, text, graph, time series and -integrate with deep learning libraries. +.. include:: readme.rst .. 
toctree:: - :caption: Getting Started - :titlesonly: + :hidden: + :maxdepth: 2 - self - getting_started/install - getting_started/quickstart + Overview .. toctree:: :caption: Advanced Usage @@ -89,6 +46,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - -.. _Machine learning blocks: https://github.com/HDI-Project/mlblocks -.. _tarball: https://github.com/HDI-Project/mlblocks/tarball/master diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 00000000..97d49585 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1 @@ +.. mdinclude:: ../README.md From 753426e5c2ec994fe8f9ca9ab928dde9380f9bf0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:34:55 -0500 Subject: [PATCH 086/160] Update devel dependencies --- .../tutorials/3. Setting MLPipeline Hyperparameters.ipynb | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 0914e806..725226f7 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 2. Setting MLPipeline Hyperparameters\n", + "# Setting MLPipeline Hyperparameters\n", "\n", "In this short guide we will see how to modify the hyperparameters\n", "of an MLPipeline in order to modify its behavior or performance.\n", @@ -429,7 +429,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 6045c574..ddb0081e 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ ], description="Pipelines and primitives for machine learning and data science.", extras_require={ - 'dev': development_requires + tests_require, + 'dev': development_requires + tests_require + examples_require, 'test': tests_require, 'examples': examples_require, }, From cd68389890109d055d05eac3ba9aefbd6e94ad1f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 17:36:59 -0500 Subject: [PATCH 087/160] Rename notebook --- ...eline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename examples/tutorials/{4. Saving and Loading a Pipeline-Copy1.ipynb => 4. Saving and Loading a Pipeline.ipynb} (99%) diff --git a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb similarity index 99% rename from examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb rename to examples/tutorials/4. Saving and Loading a Pipeline.ipynb index f8a0a5b3..01a58cd5 100644 --- a/examples/tutorials/4. Saving and Loading a Pipeline-Copy1.ipynb +++ b/examples/tutorials/4. 
Saving and Loading a Pipeline.ipynb @@ -180,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, From 6e31824e61420038e9a180c0330ab2f745dbd2a2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:15:23 -0500 Subject: [PATCH 088/160] Test readme using rundoc --- Makefile | 4 ++++ README.md | 2 +- setup.py | 8 +++++--- tox.ini | 8 +++++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index bfc1a5f6..eb422682 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle test: ## run tests quickly with the default Python python -m pytest --cov=mlblocks +.PHONY: test-readme +test-readme: ## run the readme snippets + rundoc run --single-session python3 -t python3 README.md + .PHONY: test-all test-all: ## run tests on every Python version with tox tox -r diff --git a/README.md b/README.md index f3a6e3d7..3f13fec0 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv Preprint 1905.08942. 2019. -``` bibtex +```bibtex @article{smith2019mlbazaar, author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan}, title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development}, diff --git a/setup.py b/setup.py index ddb0081e..b6ba498e 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,12 @@ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', 'mlprimitives>=0.2,<0.3', - 'urllib3>=1.20,<1.25', - 'setuptools>=41.0.0', + # 'urllib3>=1.20,<1.25', + # 'setuptools>=41.0.0', 'numpy<1.17', - 'python-dateutil<2.8.1,>=2.1', + # 'python-dateutil<2.8.1,>=2.1', + 'rundoc>=0.4.3', + 'prompt-toolkit>=2.0,<3.0', ] diff --git a/tox.ini b/tox.ini index 666eeab0..1b8a777e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py35, py36, lint, docs +envlist = py35, py36, lint, docs, readme [travis] @@ -29,3 +29,9 @@ skipsdist = true extras = dev commands = /usr/bin/env make docs + + +[testenv:readme] +skipsdist = true +commands = + /usr/bin/env make test-readme From 507564de001731915692ee698aa33eda49318b75 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:22:58 -0500 Subject: [PATCH 089/160] Fix dependencies --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index b6ba498e..a4fcc7a3 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,8 @@ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', 'mlprimitives>=0.2,<0.3', - # 'urllib3>=1.20,<1.25', - # 'setuptools>=41.0.0', + 'setuptools>=41.0.0', 'numpy<1.17', - # 'python-dateutil<2.8.1,>=2.1', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', ] From 3169f7ac4911b272d5d23dd89edaa347298dfc71 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:39:53 -0500 Subject: [PATCH 090/160] Fix readme aspect in the docs --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f13fec0..793c55f5 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,16 @@ Pipelines and Primitives for Machine Learning and Data Science. 
[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) -# Overview +--- + +# MLBlocks * Free software: MIT license * Documentation: https://HDI-Project.github.io/MLBlocks * Homepage: https://github.com/HDI-Project/MLBlocks +## Overview + MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by seamlessly combining tools from any python library with a simple, common and uniform interface. From dd4e7cc7a3f95792ac47f93a42f7816eb98ce1f8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 16 Feb 2020 18:42:36 -0500 Subject: [PATCH 091/160] Fix README header --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 793c55f5..3d8a02cb 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Pipelines and Primitives for Machine Learning and Data Science. [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) ---- +
 
 # MLBlocks
 

From 9406c65f1fea6bee3441351dba93b81893a0e3f9 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:45:17 -0500
Subject: [PATCH 092/160] Remove misleading point

---
 examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 12131c95..d295414e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,7 +5,7 @@
 functionality.
 
 Within this folder you will find:
 
-* `examples.py`: Simple Python code examples of a class and a function based primitive implementation.
+
 * `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities.
 * `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities.
 * `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities.
@@ -45,7 +45,7 @@ cd MLBlocks
 make install-examples
 ```
 
-This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) and also [MLPrimitives](
+This will install [MLBlocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives](
 https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/).
 
 4. Enter the `examples` folder and start a Jupyter Notebook:

From e48362685ce9355e4775490e93208e31e2c6278a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 16 Feb 2020 18:54:24 -0500
Subject: [PATCH 093/160] Fix link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3d8a02cb..3d3e21cd 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,7 @@ the pipelines using JSON annotations or build complex multi-branched pipelines,
 check our [documentation site](https://HDI-Project.github.io/MLBlocks).
 
 Also do not forget to have a look at the [notebook tutorials](
-https://github.com/D3-AI/GreenGuard/tree/master/examples/tutorials)!
+https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)!
 
 # Citing MLBlocks
 
From 60b5e425e844ee49dd1d6bd0b63e758cab0bbc6e Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 11:17:00 -0500
Subject: [PATCH 094/160] Improve docs quickstart and introduction

---
 README.md       |  2 +-
 docs/index.rst  | 73 ++++++++++++++++++++++++++++++++++++++++++++++---
 docs/readme.rst |  1 -
 3 files changed, 70 insertions(+), 6 deletions(-)
 delete mode 100644 docs/readme.rst

diff --git a/README.md b/README.md
index 3d3e21cd..0f54b440 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 
 # MLBlocks
 
-* Free software: MIT license
+* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
 * Documentation: https://HDI-Project.github.io/MLBlocks
 * Homepage: https://github.com/HDI-Project/MLBlocks
 
diff --git a/docs/index.rst b/docs/index.rst
index 7a6fa800..e891230c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,10 +1,70 @@
-.. include:: readme.rst
+What is MLBlocks?
+=================
+
+.. image:: images/mlblocks-logo.png
+    :width: 300 px
+    :alt: MLBlocks
+    :align: center
+
+* Free software: `MIT license <https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE>`_
+* Documentation: https://HDI-Project.github.io/MLBlocks
+* Homepage: https://github.com/HDI-Project/MLBlocks
+
+MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning
+tools developed in Python, whether they are custom developments or belong to third party
+libraries, and building Pipelines out of them that can be fitted and then used to make predictions. 
+
+This is achieved by providing a simple and intuitive annotation language that allows the
+user to specify how to integrate with each tool, here called primitives, in order to provide
+a common uniform interface to each one of them.
+
+At a high level:
+
+* Each available primitive has been annotated using a standardized JSON file that specifies its
+  native interface, as well as which hyperparameters can be used to tune its behavior.
+* A list of primitives that will be combined into a pipeline is provided by the user, optionally
+  passing along the hyperparameters to use for each primitive.
+* An MLBlock instance is built for each primitive, offering a common interface for all of them.
+* The MLBlock instances are then combined into an MLPipeline instance, able to run them all in
+  the right order, passing the output from each one as input to the next one.
+* The training data is passed to the `MLPipeline.fit` method, which sequentially fits each
+  MLBlock instance following the JSON annotation specification.
+* The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each
+  MLBlock sequentially to obtain the desired predictions.
+
+History
+-------
+
+In its first iteration, in 2015, MLBlocks was designed only for multi-table, multi-entity temporal
+data. A good reference to see our design rationale at that time is Bryan Collazo's thesis, written
+under the supervision of Kalyan Veeramachaneni:
+
+* `Machine learning blocks`_.
+  Bryan Collazo. Master's thesis, MIT EECS, 2015.
+
+In 2018, with the recent availability of a multitude of libraries and tools, we decided it was time
+to integrate them and expand the library to address other data types, like images, text, graph or
+time series, as well as introduce the usage of deep learning libraries. A second iteration of our
+work was then started by William Xue:
+
+* `A Flexible Framework for Composing End to End Machine Learning Pipelines`_.
+  William Xue. Master's thesis, MIT EECS, 2018.
+
+Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library
+that would become part of a bigger software ecosystem designed to facilitate the development of
+robust end-to-end solutions based on Machine Learning tools. This third iteration of our work
+was presented in 2019 as part of the Machine Learning Bazaar:
+
+* `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_.
+  Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. Sigmod 2020.
 
 .. toctree::
-    :hidden:
-    :maxdepth: 2
+    :caption: Getting Started
+    :titlesonly:
 
-    Overview
+    self
+    getting_started/install
+    getting_started/quickstart
 
 .. toctree::
     :caption: Advanced Usage
@@ -46,3 +106,8 @@ Indices and tables
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
+
+.. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf
+
+.. _A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf
+.. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942
diff --git a/docs/readme.rst b/docs/readme.rst
deleted file mode 100644
index 97d49585..00000000
--- a/docs/readme.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. 
mdinclude:: ../README.md From be97f0597fbbcfceb2db7643550e4f45502b46a2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 17 Feb 2020 11:58:37 -0500 Subject: [PATCH 095/160] Add missing docs --- docs/getting_started/install.rst | 43 ++++++++++ docs/getting_started/quickstart.rst | 127 ++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 docs/getting_started/install.rst create mode 100644 docs/getting_started/quickstart.rst diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst new file mode 100644 index 00000000..d2bda921 --- /dev/null +++ b/docs/getting_started/install.rst @@ -0,0 +1,43 @@ +.. highlight:: shell + +Installation +============ + +From PyPi +--------- + +The simplest and recommended way to install MLBlocks is using `pip`: + +.. code-block:: console + + pip install mlblocks + +If you don't have `pip`_ installed, this `Python installation guide`_ can guide +you through the process. + +.. _pip: https://pip.pypa.io +.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ + +Additional dependencies +----------------------- + +In order to be usable, MLBlocks requires a compatible primitives library. + +The official library, required in order to follow the MLBlocks tutorials and documentation examples, +is `MLPrimitives`_, which you can install with this command: + +.. code-block:: console + + pip install mlprimitives + +.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives + +Install for development +----------------------- + +If you are installing **MLBlocks** in order to modify its code, the installation must be done +from its sources, in the editable mode, and also including some additional dependencies in +order to be able to run the tests and build the documentation. Instructions about this process +can be found in the `Contributing guide`_. + +.. _Contributing guide: ../contributing.html#get-started diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst new file mode 100644 index 00000000..b55223dd --- /dev/null +++ b/docs/getting_started/quickstart.rst @@ -0,0 +1,127 @@ +Quickstart +========== + +Below is a short tutorial that will show you how to get started using **MLBlocks**. + +In this tutorial we will learn how to: + +* Create a pipeline using multiple primitives +* Obtain the list of tunable hyperparameters from the pipeline +* Specify hyperparameters for each primitive in the pipeline +* Fit the pipeline using training data +* Use the pipeline to make predictions from new data + +.. note:: Some additional dependencies are required in order to run this Quickstart. + Make sure that `you have already installed them`_. + +Creating a pipeline +------------------- + +With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing +them to the `MLPipeline class`_: + +.. ipython:: python + + from mlblocks import MLPipeline + primitives = [ + 'mlprimitives.custom.preprocessing.ClassEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', + 'sklearn.impute.SimpleImputer', + 'xgboost.XGBClassifier', + 'mlprimitives.custom.preprocessing.ClassDecoder' + ] + pipeline = MLPipeline(primitives) + +Optionally, specific `hyperparameters`_ can be also set by specifying them in a dictionary and +passing them as the ``init_params`` argument: + +.. 
ipython:: python + + init_params = { + 'sklearn.impute.SimpleImputer': { + 'strategy': 'median' + } + } + pipeline = MLPipeline(primitives, init_params=init_params) + +Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set +for each block, by calling the `get_hyperparameters method`_. + +The output of this method is a dictionary which has the name of each block as keys and +a dictionary with the `hyperparameters`_ of the corresponding block as values. + +.. ipython:: python + + pipeline.get_hyperparameters() + +Tunable Hyperparameters +----------------------- + +One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate +the type and possible values that each primitive hyperparameter accepts. + +The list of possible hyperparameters and their details can easily be obtained from the pipeline +instance by calling its `get_tunable_hyperparameters method`_. + +The output of this method is a dictionary that contains the list of tunable hyperparameters +for each block in the pipeline, ready to be passed to any hyperparameter tuning library such +as `BTB`_. + +.. ipython:: python + + pipeline.get_tunable_hyperparameters() + +Setting Hyperparameters +----------------------- + +Modifying the hyperparameters of an already instantiated pipeline can be done using the +`set_hyperparameters method`_, which expects a dictionary with the same format as the returned +by the `get_hyperparameters method`_. + +Note that if a subset of the hyperparameters is passed, only these will be modified, and the +other ones will remain unmodified. + +.. ipython:: python + + new_hyperparameters = { + 'xgboost.XGBClassifier#1': { + 'max_depth': 15 + } + } + pipeline.set_hyperparameters(new_hyperparameters) + hyperparameters = pipeline.get_hyperparameters() + hyperparameters['xgboost.XGBClassifier#1']['max_depth'] + +Making predictions +------------------ + +Once we have created the pipeline with the desired hyperparameters we can fit it +and then use it to make predictions on new data. + +To do this, we first call the ``fit`` method passing the training data and the corresponding +labels. + +.. ipython:: python + + from mlprimitives.datasets import load_census + dataset = load_census() + X_train, X_test, y_train, y_test = dataset.get_splits(1) + pipeline.fit(X_train, y_train) + +Once we have fitted our model to our data, we can call the ``predict`` method passing new data +to obtain predictions from the pipeline. + +.. ipython:: python + + predictions = pipeline.predict(X_test) + predictions + dataset.score(y_test, predictions) + +.. _you have already installed them: install.html#additional-dependencies +.. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline +.. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters +.. _hyperparameters: ../advanced_usage/hyperparameters.html +.. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations +.. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters +.. _BTB: https://github.com/HDI-Project/BTB +.. 
_set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters

From c7194847264d2b85e183073aafd401ea8367c8ba Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 17 Feb 2020 13:40:59 -0500
Subject: [PATCH 096/160] Update quickstart description

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0f54b440..6fb2f56c 100644
--- a/README.md
+++ b/README.md
@@ -79,8 +79,10 @@ pip install mlprimitives
 
 # Quickstart
 
-Below there is a short example about how to use **MLBlocks** to solve a prediction problem
-using the primitives and pipelines from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives).
+Below is a short example of how to use **MLBlocks** to solve the [Adult Census
+Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
+pipeline that combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives),
+[scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/).
 
 ```python3
 from mlblocks import MLPipeline

From 78a47d6cda812406b48c71ba62cb7d5c34d74250 Mon Sep 17 00:00:00 2001
From: JDTheRipperPC
Date: Thu, 20 Feb 2020 12:10:45 +0100
Subject: [PATCH 097/160] Add Development status badge

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 6fb2f56c..fa4260d5 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Pipelines and Primitives for Machine Learning and Data Science.

+[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks)
 [![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks)
 [![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks)
@@ -21,6 +22,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 # MLBlocks
 
 * Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE)
+* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 * Documentation: https://HDI-Project.github.io/MLBlocks
 * Homepage: https://github.com/HDI-Project/MLBlocks
 
From 0a9205b08c426e1e3d63fd75a0fc39c855fa176f Mon Sep 17 00:00:00 2001
From: Erica Chiu
Date: Sun, 23 Feb 2020 22:42:46 -0500
Subject: [PATCH 098/160] Add diagram

---
 mlblocks/mlpipeline.py | 183 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 177 insertions(+), 6 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index dcfc8a0b..051e8338 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -8,6 +8,7 @@
 import warnings
 from collections import Counter, OrderedDict, defaultdict
 from copy import deepcopy
+from graphviz import Digraph
 
 import numpy as np
 
@@ -250,8 +251,7 @@ def _get_str_output(self, output):
     def get_inputs(self, fit=True):
         """Get a relation of all the input variables required by this pipeline.
 
-        The result is a dictionary that maps each variable name with their
-        specified information.
+        The result is a list that contains all of the input variables.
         Optionally include the fit arguments.
 
         Args:
@@ -259,9 +259,8 @@
             Optional argument to include fit arguments or not. Defaults to ``True``.
 
         Returns:
-            dictionary:
-                A dictionary mapping every input variable's name to a dictionary
-                specifying the information corresponding to that input variable.
+            list:
+                List of dictionaries specifying all the input variables.
                 Each dictionary contains the entry ``name``, as well
                 as any other metadata that may have been included in the
                 pipeline inputs specification.
@@ -292,7 +291,19 @@
             )
             inputs.update(fit_inputs)
 
-        return inputs
+        inputs_list=[]
+        if 'X' in inputs:
+            inputs_list.append(inputs['X'])
+            del inputs['X']
+
+        if 'y' in inputs:
+            inputs_list.append(inputs['y'])
+            del inputs['y']
+
+        for input_value in inputs.values():
+            inputs_list.append(input_value)
+
+        return inputs_list
 
     def get_outputs(self, outputs='default'):
         """Get the list of output variables that correspond to the specified outputs. 
@@ -857,6 +868,166 @@ def to_dict(self): 'outputs': self.outputs, } + def _get_simple_block_name(self, block_name): + full_name = block_name.split("#")[0] + simple_name = full_name.split(".")[-1] + return simple_name + + def _get_context_name_from_variable(self, variable_name): + block_name = variable_name.split("#")[0] + rest = variable_name[len(block_name)+1:] + block_index = rest.split(".")[0] + context_name = rest[len(block_index)+1:] + if len(context_name) == 0: + raise ValueError("Invalid variable name") + return context_name + + + def get_diagram(self, fit=True, outputs='default', image_path=None): + """ + Creates a png diagram for the pipeline, showing Pipeline Steps, + Pipeline Inputs and Outputs, and block inputs and outputs. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or a full variable specification + following the format ``{block-name}.{variable-name}``. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to `True`. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + image_path (str): + Optional argument for the location at which to save the file. + Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file. + + Returns: + None or `graphviz.Digraph` object: + * `graphviz.Digraph` contains the information about the Pipeline Diagram + """ + + diagram = Digraph(format='png') + diagram.attr('graph', splines='ortho') + diagram.attr('node', shape='box', penwidth='1') + + # Blocks + for block_name in self.blocks.keys(): + simple_name = self._get_simple_block_name(block_name) + diagram.node(block_name, simple_name) + + variables = {} + + # Inputs + inputs = self.get_inputs(fit) + input_variables = [] + with diagram.subgraph(name="cluster_inputs") as cluster: + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='24') + cluster.node('Input', 'Input', fontsize='14') + cluster.attr('edge', penwidth='0', arrowhead='none') + for input_value in inputs: + input_name = input_value['name'] + variables[input_name] = input_name+'_input' + input_variables.append(input_name) + cluster.node(variables[input_name], input_name) + cluster.edge('Input', variables[input_name]) + + with cluster.subgraph() as input_variables_subgraph: + input_variables_subgraph.attr(None, rank='same') + for index in range(1, len(input_variables)): + input_variables_subgraph.edge(variables[input_variables[index-1]], + variables[input_variables[index]]) + input_variables_subgraph.attr(None, rankdir='LR') + + # Outputs + outputs = self.get_outputs(outputs) + output_variables = [] + with diagram.subgraph(name="cluster_outputs") as cluster: + cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') + cluster.attr('node', penwidth='0', fontsize='24') + cluster.node('Output', 'Output', fontsize='14') + cluster.attr('edge', penwidth='0', arrowhead='none') + for output in outputs: + try: + variable_name = self._get_context_name_from_variable(output['variable']) + except ValueError: + raise NotImplementedError('Can not deal with this type of output specification') + cluster.node(variable_name+'_output', variable_name) + output_variables.append(variable_name) + cluster.edge(output_variables[-1] + '_output', 'Output') + with cluster.subgraph() as output_variables_subgraph: + output_variables_subgraph.attr(None, rank='same') + for index in range(1, len(output_variables)): + 
output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output') + output_variables_subgraph.attr(None, rankdir='LR') + + cluster_edges = set() + + # Variables + diagram.attr('node', fontsize='14', penwidth='0') + diagram.attr('edge', penwidth='1') + for block_name, block in self.blocks.items(): + # Inputs + input_names = self.input_names.get(block_name, dict()) + input_variables = block.produce_args + if fit: + for input_variable in block.fit_args: + if input_variable not in input_variables: + input_variables.append(input_variable) + for input_variable in input_variables: + input_variable_name = input_variable['name'] + if input_variable_name in input_names: + diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue') + original_variable_name = input_names[input_variable_name] + diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name) + cluster_edges.add((block_name+' '+input_variable_name, block_name)) + else: + diagram.edge(variables[input_variable_name], block_name) + + # Outputs + output_names = self.output_names.get(block_name, dict()) + for output_variable in block.produce_output: + output_variable_name = output_variable['name'] + if output_variable_name in output_names: + diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red') + cluster_edges.add((block_name, block_name+' '+output_variable_name)) + new_variable_name = output_names[output_variable_name] + diagram.node(block_name+' '+new_variable_name, new_variable_name) + diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none') + variables[new_variable_name] = block_name+' '+new_variable_name + else: + diagram.node(block_name+' '+output_variable_name, output_variable_name) + diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none') + variables[output_variable_name] = block_name+' '+output_variable_name + + # Connection to output variables + for output_variable in output_variables: + variable_block = variables[output_variable] + diagram.edge(variable_block, output_variable+'_output') + + # Alignment + with diagram.subgraph() as alignment: + alignment.attr('graph', penwidth='0') + alignment.attr('edge', penwidth='0', arrowhead='none') + for index in range(1, len(self.blocks)): + alignment.edge(self._get_block_name(index-1), self._get_block_name(index)) + + # Optional names + alignment.attr('edge', len='1', minlen='1', penwidth='1') + + for first_block, second_block in cluster_edges: + with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster: + cluster.edge(first_block, second_block) + + if image_path: + diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png') + else: + return diagram + + + def save(self, path): """Save the specification of this MLPipeline in a JSON file. 
From 248ccd7b4bdb5f5e9783283ec92b3f0c78d88dff Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 17 Mar 2020 11:24:48 -0700 Subject: [PATCH 099/160] Setup update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1e8ef2ad..93b78ac3 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ install_requires = [ + 'graphviz>=0.9,<1', ] @@ -44,7 +45,6 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz>=0.9', 'ipython>=6.5.0', 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', From 0fd635c249f555217e2cf5d0fa171571a558d718 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Sat, 23 May 2020 16:54:15 -0700 Subject: [PATCH 100/160] Fix test and lint errors --- mlblocks/mlpipeline.py | 369 ++++++++++++++++++++++++++++++----------- 1 file changed, 268 insertions(+), 101 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 051e8338..128932f6 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -260,7 +260,7 @@ def get_inputs(self, fit=True): Returns: list: - List of dictionaries specifying all the input variables. + Dictionary specifying all the input variables. Each dictionary contains the entry ``name``, as well as any other metadata that may have been included in the pipeline inputs specification. @@ -291,19 +291,7 @@ def get_inputs(self, fit=True): ) inputs.update(fit_inputs) - inputs_list=[] - if 'X' in inputs: - inputs_list.append(inputs['X']) - del inputs['X'] - - if 'y' in inputs: - inputs_list.append(inputs['y']) - del inputs['y'] - - for input_value in inputs.values(): - inputs_list.append(input_value) - - return inputs_list + return inputs def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. @@ -869,67 +857,80 @@ def to_dict(self): } def _get_simple_block_name(self, block_name): + """ + Gets the most readable, simplest version of the block name, + without the number of the block or excess modifiers. + + Args: + block_name (str): + Name of the block whose simple name is being extracted. + Returns: + str: + block name stripped of number and other modifiers. + """ full_name = block_name.split("#")[0] simple_name = full_name.split(".")[-1] return simple_name def _get_context_name_from_variable(self, variable_name): + """ + Gets the name of the context from the given variable. + + Args: + variable_name (str): + Name of the variable. + Returns: + str: + Name of the context of the variable. + """ block_name = variable_name.split("#")[0] - rest = variable_name[len(block_name)+1:] + rest = variable_name[len(block_name) + 1:] block_index = rest.split(".")[0] - context_name = rest[len(block_index)+1:] + context_name = rest[len(block_index) + 1:] if len(context_name) == 0: raise ValueError("Invalid variable name") return context_name - - def get_diagram(self, fit=True, outputs='default', image_path=None): + def _make_diagram_blocks(self, diagram): """ - Creates a png diagram for the pipeline, showing Pipeline Steps, - Pipeline Inputs and Outputs, and block inputs and outputs. - - If strings are given, they can either be one of the named outputs that have - been specified on the pipeline definition or a full variable specification - following the format ``{block-name}.{variable-name}``. + Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram. Args: - fit (bool): - Optional argument to include fit arguments or not. Defaults to `True`. 
- - outputs (str, int, or list[str or int]): - Single or list of output specifications. - - image_path (str): - Optional argument for the location at which to save the file. - Defaults to `None`, which returns a `graphviz.Digraph` object instead of saving the file. - - Returns: - None or `graphviz.Digraph` object: - * `graphviz.Digraph` contains the information about the Pipeline Diagram + diagram (Digraph): + Diagram to be modified. """ - - diagram = Digraph(format='png') - diagram.attr('graph', splines='ortho') diagram.attr('node', shape='box', penwidth='1') - - # Blocks for block_name in self.blocks.keys(): simple_name = self._get_simple_block_name(block_name) diagram.node(block_name, simple_name) - variables = {} + def _make_diagram_inputs(self, diagram, fit): + """ + Modifies the diagram to add the inputs of the pipeline - # Inputs - inputs = self.get_inputs(fit) + Args: + diagram (Digraph): + Diagram to be modified. + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + Returns: + dict: + Dictionary of variables mapped to their label for their node in the pipeline. + """ + diagram.attr('node', shape='box') + variables = {} input_variables = [] + inputs = self.get_inputs(fit) + with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='24') - cluster.node('Input', 'Input', fontsize='14') cluster.attr('edge', penwidth='0', arrowhead='none') - for input_value in inputs: - input_name = input_value['name'] - variables[input_name] = input_name+'_input' + cluster.node('Input', 'Input', fontsize='14') + for input_name in inputs: + variables[input_name] = input_name + '_input' input_variables.append(input_name) cluster.node(variables[input_name], input_name) cluster.edge('Input', variables[input_name]) @@ -937,97 +938,263 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') for index in range(1, len(input_variables)): - input_variables_subgraph.edge(variables[input_variables[index-1]], - variables[input_variables[index]]) + input_variables_subgraph.edge( + variables[input_variables[index - 1]], + variables[input_variables[index]]) input_variables_subgraph.attr(None, rankdir='LR') - # Outputs - outputs = self.get_outputs(outputs) + return variables + + def _make_diagram_outputs(self, diagram, outputs): + """ + Modifies the diagram to add outputs of the pipeline in order from left to right. + + Args: + diagram (Digraph): + Diagram to be modified. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. 
+ + Returns: + list[str]: + List of the human-readable names of the output variables in order + """ + diagram.attr('node', shape='box') output_variables = [] + outputs_vars = self.get_outputs(outputs) + with diagram.subgraph(name="cluster_outputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='24') - cluster.node('Output', 'Output', fontsize='14') cluster.attr('edge', penwidth='0', arrowhead='none') - for output in outputs: + cluster.node('Output', 'Output', fontsize='14') + for output in outputs_vars: try: variable_name = self._get_context_name_from_variable(output['variable']) except ValueError: - raise NotImplementedError('Can not deal with this type of output specification') - cluster.node(variable_name+'_output', variable_name) + raise NotImplementedError( + 'Can not deal with this type of output specification') + cluster.node(variable_name + '_output', variable_name) output_variables.append(variable_name) cluster.edge(output_variables[-1] + '_output', 'Output') with cluster.subgraph() as output_variables_subgraph: output_variables_subgraph.attr(None, rank='same') for index in range(1, len(output_variables)): - output_variables_subgraph.edge(output_variables[index-1]+'_output', output_variables[index]+'_output') + output_variables_subgraph.edge(output_variables[index - 1] + '_output', + output_variables[index] + '_output') output_variables_subgraph.attr(None, rankdir='LR') - cluster_edges = set() + return output_variables - # Variables + def _make_diagram_variables(self, diagram, fit, variables): + """ + Modifies the diagram to add main variables of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ diagram.attr('node', fontsize='14', penwidth='0') diagram.attr('edge', penwidth='1') + cluster_edges = set() + for block_name, block in self.blocks.items(): - # Inputs - input_names = self.input_names.get(block_name, dict()) - input_variables = block.produce_args - if fit: - for input_variable in block.fit_args: - if input_variable not in input_variables: - input_variables.append(input_variable) - for input_variable in input_variables: - input_variable_name = input_variable['name'] - if input_variable_name in input_names: - diagram.node(block_name+' '+input_variable_name, '('+input_variable_name+')', fontcolor='blue') - original_variable_name = input_names[input_variable_name] - diagram.edge(variables[original_variable_name], block_name+' '+input_variable_name) - cluster_edges.add((block_name+' '+input_variable_name, block_name)) - else: - diagram.edge(variables[input_variable_name], block_name) - - # Outputs - output_names = self.output_names.get(block_name, dict()) - for output_variable in block.produce_output: - output_variable_name = output_variable['name'] - if output_variable_name in output_names: - diagram.node(block_name+' '+output_variable_name, '('+output_variable_name+')', fontcolor='red') - cluster_edges.add((block_name, block_name+' '+output_variable_name)) - new_variable_name = output_names[output_variable_name] - diagram.node(block_name+' '+new_variable_name, new_variable_name) - diagram.edge(block_name+' '+output_variable_name, block_name+' '+new_variable_name, arrowhead='none') - variables[new_variable_name] = block_name+' '+new_variable_name - else: - diagram.node(block_name+' '+output_variable_name, output_variable_name) - diagram.edge(block_name, block_name+' '+output_variable_name, arrowhead='none') - variables[output_variable_name] = block_name+' '+output_variable_name + self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block, + block_name) + self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block, + block_name) + return cluster_edges + + def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block, + block_name): + """ + Modifies the diagram to add input variables the corresponding block of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + cluster_edges (set): + Set of tuples that may contain some alternative variable names and its + corresponding block in order + + block (MLBlock): + The block to add its input variables to the diagram + + block_name (str): + The name of the block to add its input variables to the diagram + + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ + input_names = self.input_names.get(block_name, dict()) + input_variables = block.produce_args + + if fit: + for input_variable in block.fit_args: + if input_variable not in input_variables: + input_variables.append(input_variable) + for input_variable in input_variables: + input_variable_name = input_variable['name'] + if input_variable_name in input_names: + diagram.node(block_name + ' ' + input_variable_name, + '(' + input_variable_name + ')', fontcolor='blue') + original_variable_name = input_names[input_variable_name] + diagram.edge(variables[original_variable_name], + block_name + ' ' + input_variable_name) + cluster_edges.add((block_name + ' ' + input_variable_name, block_name)) + else: + diagram.edge(variables[input_variable_name], block_name) + + def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, + block_name): + """ + Modifies the diagram to add output variables the corresponding block of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified + + fit (bool): + `True` if including fitted arguments, `False` otherwise. + + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. + + cluster_edges (set): + Set of tuples that may contain some alternative variable names and its + corresponding block in order + + block (MLBlock): + The block to add its output variables to the diagram + + block_name (str): + The name of the block to add its output variables to the diagram + + Returns: + set: + Set of tuples of the alternative variable name and its corresponding block + in order + """ + output_names = self.output_names.get(block_name, dict()) + for output_variable in block.produce_output: + output_variable_name = output_variable['name'] + if output_variable_name in output_names: + diagram.node(block_name + ' ' + output_variable_name, + '(' + output_variable_name + ')', fontcolor='red') + cluster_edges.add((block_name, block_name + ' ' + output_variable_name)) + new_variable_name = output_names[output_variable_name] + diagram.node(block_name + ' ' + new_variable_name, new_variable_name) + diagram.edge(block_name + ' ' + output_variable_name, + block_name + ' ' + new_variable_name, arrowhead='none') + variables[new_variable_name] = block_name + ' ' + new_variable_name + else: + diagram.node(block_name + ' ' + output_variable_name, output_variable_name) + diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none') + variables[output_variable_name] = block_name + ' ' + output_variable_name + + def _make_diagram_output_connections(self, diagram, variables, output_variables): + """ + Modifies the diagram to add connections to the output variables of the pipeline. + + Args: + diagram (Digraph): + Diagram to be modified - # Connection to output variables + variables (dict): + Dictionary of variables mapped to their label for their node in the pipeline. 
+ + output_variables (list[str]): + List of the human-readable names of the output variables in order + """ for output_variable in output_variables: variable_block = variables[output_variable] - diagram.edge(variable_block, output_variable+'_output') + diagram.edge(variable_block, output_variable + '_output') + + def _make_diagram_alignment(self, diagram, cluster_edges): + """ + Modifies the diagram to add alignment edges and connect alternative names to the blocks. + + Args: + diagram (Digraph): + Diagram to be modified - # Alignment + cluster_edges (set): + Set of tuples that contain alternative variable names and its + corresponding block in order + """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') alignment.attr('edge', penwidth='0', arrowhead='none') for index in range(1, len(self.blocks)): - alignment.edge(self._get_block_name(index-1), self._get_block_name(index)) + alignment.edge(self._get_block_name(index - 1), self._get_block_name(index)) - # Optional names alignment.attr('edge', len='1', minlen='1', penwidth='1') - for first_block, second_block in cluster_edges: - with alignment.subgraph(name='cluster_'+first_block+second_block) as cluster: + with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: cluster.edge(first_block, second_block) + def get_diagram(self, fit=True, outputs='default', image_path=None): + """ + Creates a png diagram for the pipeline, showing Pipeline Steps, + Pipeline Inputs and Outputs, and block inputs and outputs. + + If strings are given, they can either be one of the named outputs that have + been specified on the pipeline definition or a full variable specification + following the format ``{block-name}.{variable-name}``. + + Args: + fit (bool): + Optional argument to include fit arguments or not. Defaults to `True`. + + outputs (str, int, or list[str or int]): + Single or list of output specifications. + + image_path (str): + Optional argument for the location at which to save the file. + Defaults to `None`, which returns a `graphviz.Digraph` object instead of + saving the file. + + Returns: + None or `graphviz.Digraph` object: + * `graphviz.Digraph` contains the information about the Pipeline Diagram + """ + + diagram = Digraph(format='png') + diagram.attr('graph', splines='ortho') + + self._make_diagram_blocks(diagram) + variables = self._make_diagram_inputs(diagram, fit) + output_variables = self._make_diagram_outputs(diagram, outputs) + cluster_edges = self._make_diagram_variables(diagram, fit, variables) + self._make_diagram_output_connections(diagram, variables, output_variables) + self._make_diagram_alignment(diagram, cluster_edges) + if image_path: diagram.render(filename='Diagram', directory=image_path, cleanup=True, format='png') else: return diagram - - def save(self, path): """Save the specification of this MLPipeline in a JSON file. 
From d47e339496b72fb13472fa68e4044f978a9cf0a4 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 10:15:32 -0700 Subject: [PATCH 101/160] Fix import order --- mlblocks/mlpipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 128932f6..a96995be 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -8,9 +8,9 @@ import warnings from collections import Counter, OrderedDict, defaultdict from copy import deepcopy -from graphviz import Digraph import numpy as np +from graphviz import Digraph from mlblocks.discovery import load_pipeline from mlblocks.mlblock import MLBlock From 62c310e72b10779544d993bb172be708e8095ccd Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 11:36:11 -0700 Subject: [PATCH 102/160] Fix double arrow bug --- mlblocks/mlpipeline.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index a96995be..48d68268 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -864,6 +864,7 @@ def _get_simple_block_name(self, block_name): Args: block_name (str): Name of the block whose simple name is being extracted. + Returns: str: block name stripped of number and other modifiers. @@ -879,6 +880,7 @@ def _get_context_name_from_variable(self, variable_name): Args: variable_name (str): Name of the variable. + Returns: str: Name of the context of the variable. @@ -926,7 +928,7 @@ def _make_diagram_inputs(self, diagram, fit): with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') - cluster.attr('node', penwidth='0', fontsize='24') + cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Input', 'Input', fontsize='14') for input_name in inputs: @@ -966,7 +968,7 @@ def _make_diagram_outputs(self, diagram, outputs): with diagram.subgraph(name="cluster_outputs") as cluster: cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') - cluster.attr('node', penwidth='0', fontsize='24') + cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Output', 'Output', fontsize='14') for output in outputs_vars: @@ -1048,23 +1050,23 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e in order """ input_names = self.input_names.get(block_name, dict()) - input_variables = block.produce_args + input_variables = set(variable['name'] for variable in block.produce_args) if fit: for input_variable in block.fit_args: - if input_variable not in input_variables: - input_variables.append(input_variable) + if input_variable['name'] not in input_variables: + input_variables.add(input_variable['name']) + for input_variable in input_variables: - input_variable_name = input_variable['name'] - if input_variable_name in input_names: - diagram.node(block_name + ' ' + input_variable_name, - '(' + input_variable_name + ')', fontcolor='blue') - original_variable_name = input_names[input_variable_name] + if input_variable in input_names: + diagram.node(block_name + ' ' + input_variable, + '(' + input_variable + ')', fontcolor='blue') + original_variable_name = input_names[input_variable] diagram.edge(variables[original_variable_name], - block_name + ' ' + input_variable_name) - cluster_edges.add((block_name + ' ' + input_variable_name, block_name)) + block_name + ' ' + 
input_variable) + cluster_edges.add((block_name + ' ' + input_variable, block_name)) else: - diagram.edge(variables[input_variable_name], block_name) + diagram.edge(variables[input_variable], block_name) def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, block_name): @@ -1145,10 +1147,6 @@ def _make_diagram_alignment(self, diagram, cluster_edges): """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') - alignment.attr('edge', penwidth='0', arrowhead='none') - for index in range(1, len(self.blocks)): - alignment.edge(self._get_block_name(index - 1), self._get_block_name(index)) - alignment.attr('edge', len='1', minlen='1', penwidth='1') for first_block, second_block in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: From a3394b55aa505771a229676991541ea66cc4c226 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Tue, 26 May 2020 17:06:01 -0700 Subject: [PATCH 103/160] Edit tooltips --- mlblocks/mlpipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 48d68268..81c3fc19 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -927,10 +927,11 @@ def _make_diagram_inputs(self, diagram, fit): inputs = self.get_inputs(fit) with diagram.subgraph(name="cluster_inputs") as cluster: + cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') - cluster.node('Input', 'Input', fontsize='14') + cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables') for input_name in inputs: variables[input_name] = input_name + '_input' input_variables.append(input_name) @@ -967,10 +968,11 @@ def _make_diagram_outputs(self, diagram, outputs): outputs_vars = self.get_outputs(outputs) with diagram.subgraph(name="cluster_outputs") as cluster: + cluster.attr(tooltip='Output variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') - cluster.node('Output', 'Output', fontsize='14') + cluster.node('Output', 'Output', fontsize='14', tooltip='Output variables') for output in outputs_vars: try: variable_name = self._get_context_name_from_variable(output['variable']) @@ -1180,6 +1182,8 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram = Digraph(format='png') diagram.attr('graph', splines='ortho') + diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges + diagram.attr('edge', tooltip=' ') self._make_diagram_blocks(diagram) variables = self._make_diagram_inputs(diagram, fit) From aabab78e56b60216330ec460b8c3a3dcc040aa30 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Wed, 27 May 2020 14:08:46 -0700 Subject: [PATCH 104/160] Fix bug with repetitive variable node names --- mlblocks/mlpipeline.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 81c3fc19..af00e34b 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -1061,12 +1061,13 @@ def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_e for input_variable in input_variables: if input_variable in input_names: - diagram.node(block_name + ' ' + input_variable, + input_variable_label = block_name + ' ' + 
input_variable + ' (input)' + diagram.node(input_variable_label, '(' + input_variable + ')', fontcolor='blue') original_variable_name = input_names[input_variable] diagram.edge(variables[original_variable_name], - block_name + ' ' + input_variable) - cluster_edges.add((block_name + ' ' + input_variable, block_name)) + input_variable_label) + cluster_edges.add((input_variable_label, block_name)) else: diagram.edge(variables[input_variable], block_name) @@ -1104,18 +1105,20 @@ def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges for output_variable in block.produce_output: output_variable_name = output_variable['name'] if output_variable_name in output_names: - diagram.node(block_name + ' ' + output_variable_name, + output_variable_label = block_name + ' ' + output_variable_name + ' (output)' + diagram.node(output_variable_label, '(' + output_variable_name + ')', fontcolor='red') - cluster_edges.add((block_name, block_name + ' ' + output_variable_name)) + cluster_edges.add((block_name, output_variable_label)) new_variable_name = output_names[output_variable_name] diagram.node(block_name + ' ' + new_variable_name, new_variable_name) - diagram.edge(block_name + ' ' + output_variable_name, + diagram.edge(output_variable_label, block_name + ' ' + new_variable_name, arrowhead='none') variables[new_variable_name] = block_name + ' ' + new_variable_name else: - diagram.node(block_name + ' ' + output_variable_name, output_variable_name) - diagram.edge(block_name, block_name + ' ' + output_variable_name, arrowhead='none') - variables[output_variable_name] = block_name + ' ' + output_variable_name + output_variable_label = block_name + ' ' + output_variable_name + diagram.node(output_variable_label, output_variable_name) + diagram.edge(block_name, output_variable_label, arrowhead='none') + variables[output_variable_name] = output_variable_label def _make_diagram_output_connections(self, diagram, variables, output_variables): """ From ca2c973583942a1dbab72ec96f1013e4069d6995 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Wed, 27 May 2020 22:27:44 -0700 Subject: [PATCH 105/160] Remove unnecessary nodes and edges from diagram --- mlblocks/mlpipeline.py | 339 ++++++++++++++++++++--------------------- 1 file changed, 162 insertions(+), 177 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index af00e34b..6d2738ba 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -893,22 +893,57 @@ def _get_context_name_from_variable(self, variable_name): raise ValueError("Invalid variable name") return context_name - def _make_diagram_blocks(self, diagram): + def _get_relevant_output_variables(self, block_name, block, current_output_variables): """ - Modifies the diagram to add blocks of the pipeline as visible nodes in the diagram. 
+ Gets the output variables of the given block that are in a given set of output variables + + Args: + block_name (str): + The name of the block from which the variables are outputted + block (MLBlock): + The block from which the variables are outputted + + current_output_variables (list): + A list of possible output variables to return + + Returns: + set: + A set of strings containing the output variable name if and only if it is an + output variable of the given block and its name is in the list of possible + output variables + """ + output_alt_names = self.output_names.get(block_name, dict()) + relevant_output = set() + for block_output in block.produce_output: + output_variable_name = block_output['name'] + if output_variable_name in output_alt_names.keys(): + output_variable_name = output_alt_names[output_variable_name] + + if output_variable_name in current_output_variables: + relevant_output.add(block_output['name']) + + return relevant_output + + def _make_diagram_block(self, diagram, block_name): + """ + Modifies the diagram to add the corresponding block of the pipeline as a visible node in + the diagram. Args: diagram (Digraph): Diagram to be modified. + + block_name (str): + Name of block to be added to the diagram """ - diagram.attr('node', shape='box', penwidth='1') - for block_name in self.blocks.keys(): - simple_name = self._get_simple_block_name(block_name) - diagram.node(block_name, simple_name) + simple_name = self._get_simple_block_name(block_name) + diagram.node(block_name, simple_name, penwidth='1') - def _make_diagram_inputs(self, diagram, fit): + def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, variable_blocks): """ - Modifies the diagram to add the inputs of the pipeline + Modifies the diagram to add the corresponding input variables to the corresponding block + and their edges as outputs to other blocks by modifying `variable_blocks`. Additionally + modifies a set of edges to add any edges between an alternative input name and this block. Args: diagram (Digraph): @@ -917,37 +952,120 @@ def _make_diagram_inputs(self, diagram, fit): fit (bool): `True` if including fitted arguments, `False` otherwise. - Returns: - dict: - Dictionary of variables mapped to their label for their node in the pipeline. 
+ block_name (str): + Name of block whose input variables are to be added to the diagram + + block (MLBlock): + Block whose input variables are to be added to the diagram + + cluster_edges (set): + Set of edges between alternative variable names and their corresponding block + + variable_blocks (dict): + Dictionary of variable names and the set of blocks into which the variable connects """ - diagram.attr('node', shape='box') - variables = {} - input_variables = [] - inputs = self.get_inputs(fit) + input_alt_names = self.input_names.get(block_name, dict()) + input_variables = set(variable['name'] for variable in block.produce_args) + + if fit: + for input_variable in block.fit_args: + if input_variable['name'] not in input_variables: + input_variables.add(input_variable['name']) + for input_name in input_variables: + input_block = block_name + if input_name in input_alt_names: + input_variable_label = block_name + ' ' + input_name + ' (input)' + diagram.node(input_variable_label, + '(' + input_name + ')', fontcolor='blue') + cluster_edges.add((input_variable_label, block_name)) + input_name = input_alt_names[input_name] + input_block = input_variable_label + + if input_name in variable_blocks.keys(): + variable_blocks[input_name].add(input_block) + else: + variable_blocks[input_name] = {input_block} + + def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, + variable_blocks): + """ + Modifies the diagram to add the corresponding output variables to the corresponding block + and their edges as inputs to other blocks, as well as updating `variable_blocks`. + Additionally modifies a set of edges to add any edges between an alternative output name + and this block. + + Args: + diagram (Digraph): + Diagram to be modified. + + block_name (str): + Name of block whose output variables are to be added to the diagram + + output_names (set): + Set of output variable names to be added to the diagram + + cluster_edges (set): + Set of edges between alternative variable names and their corresponding block + + variable_blocks (dict): + Dictionary of variable names and the set of blocks into which the variable connects + """ + output_alt_names = self.output_names.get(block_name, dict()) + for output_name in output_names: + output_block = block_name + if output_name in output_alt_names.keys(): + alt_variable_label = block_name + ' ' + output_name + ' (output)' + diagram.node(alt_variable_label, + '(' + output_name + ')', fontcolor='red') + cluster_edges.add((block_name, alt_variable_label)) + output_name = output_alt_names[output_name] + output_block = alt_variable_label + + output_variable_label = block_name + ' ' + output_name + diagram.node(output_variable_label, output_name) + diagram.edge(output_block, output_variable_label, arrowhead='none') + + for block in variable_blocks[output_name]: + diagram.edge(output_variable_label, block) + + del variable_blocks[output_name] + + def _make_diagram_inputs(self, diagram, input_variables_blocks): + """ + Modifies the diagram to add the inputs of the pipeline + + Args: + diagram (Digraph): + Diagram to be modified. 
+ + input_variables_blocks (dict): + Dictionary of input variables of the pipeline and the set of blocks where the + corresponding variable is an input + """ with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') cluster.attr('edge', penwidth='0', arrowhead='none') cluster.node('Input', 'Input', fontsize='14', tooltip='Input variables') - for input_name in inputs: - variables[input_name] = input_name + '_input' - input_variables.append(input_name) - cluster.node(variables[input_name], input_name) - cluster.edge('Input', variables[input_name]) + input_variables = [] + for input_name, blocks in input_variables_blocks.items(): + input_name_label = input_name + '_input' + cluster.node(input_name_label, input_name) + cluster.edge('Input', input_name_label) + input_variables.append(input_name_label) + + for block in blocks: + diagram.edge(input_name_label, block, pendwith='1') with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') for index in range(1, len(input_variables)): - input_variables_subgraph.edge( - variables[input_variables[index - 1]], - variables[input_variables[index]]) + input_variables_subgraph.edge(input_variables[index - 1], + input_variables[index]) input_variables_subgraph.attr(None, rankdir='LR') - return variables - def _make_diagram_outputs(self, diagram, outputs): """ Modifies the diagram to add outputs of the pipeline in order from left to right. @@ -963,7 +1081,6 @@ def _make_diagram_outputs(self, diagram, outputs): list[str]: List of the human-readable names of the output variables in order """ - diagram.attr('node', shape='box') output_variables = [] outputs_vars = self.get_outputs(outputs) @@ -991,153 +1108,6 @@ def _make_diagram_outputs(self, diagram, outputs): return output_variables - def _make_diagram_variables(self, diagram, fit, variables): - """ - Modifies the diagram to add main variables of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - diagram.attr('node', fontsize='14', penwidth='0') - diagram.attr('edge', penwidth='1') - cluster_edges = set() - - for block_name, block in self.blocks.items(): - self._make_diagram_variables_input_block(diagram, fit, variables, cluster_edges, block, - block_name) - self._make_diagram_variables_output_block(diagram, variables, cluster_edges, block, - block_name) - return cluster_edges - - def _make_diagram_variables_input_block(self, diagram, fit, variables, cluster_edges, block, - block_name): - """ - Modifies the diagram to add input variables the corresponding block of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. 
- - cluster_edges (set): - Set of tuples that may contain some alternative variable names and its - corresponding block in order - - block (MLBlock): - The block to add its input variables to the diagram - - block_name (str): - The name of the block to add its input variables to the diagram - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - input_names = self.input_names.get(block_name, dict()) - input_variables = set(variable['name'] for variable in block.produce_args) - - if fit: - for input_variable in block.fit_args: - if input_variable['name'] not in input_variables: - input_variables.add(input_variable['name']) - - for input_variable in input_variables: - if input_variable in input_names: - input_variable_label = block_name + ' ' + input_variable + ' (input)' - diagram.node(input_variable_label, - '(' + input_variable + ')', fontcolor='blue') - original_variable_name = input_names[input_variable] - diagram.edge(variables[original_variable_name], - input_variable_label) - cluster_edges.add((input_variable_label, block_name)) - else: - diagram.edge(variables[input_variable], block_name) - - def _make_diagram_variables_output_block(self, diagram, variables, cluster_edges, block, - block_name): - """ - Modifies the diagram to add output variables the corresponding block of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - fit (bool): - `True` if including fitted arguments, `False` otherwise. - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. - - cluster_edges (set): - Set of tuples that may contain some alternative variable names and its - corresponding block in order - - block (MLBlock): - The block to add its output variables to the diagram - - block_name (str): - The name of the block to add its output variables to the diagram - - Returns: - set: - Set of tuples of the alternative variable name and its corresponding block - in order - """ - output_names = self.output_names.get(block_name, dict()) - for output_variable in block.produce_output: - output_variable_name = output_variable['name'] - if output_variable_name in output_names: - output_variable_label = block_name + ' ' + output_variable_name + ' (output)' - diagram.node(output_variable_label, - '(' + output_variable_name + ')', fontcolor='red') - cluster_edges.add((block_name, output_variable_label)) - new_variable_name = output_names[output_variable_name] - diagram.node(block_name + ' ' + new_variable_name, new_variable_name) - diagram.edge(output_variable_label, - block_name + ' ' + new_variable_name, arrowhead='none') - variables[new_variable_name] = block_name + ' ' + new_variable_name - else: - output_variable_label = block_name + ' ' + output_variable_name - diagram.node(output_variable_label, output_variable_name) - diagram.edge(block_name, output_variable_label, arrowhead='none') - variables[output_variable_name] = output_variable_label - - def _make_diagram_output_connections(self, diagram, variables, output_variables): - """ - Modifies the diagram to add connections to the output variables of the pipeline. - - Args: - diagram (Digraph): - Diagram to be modified - - variables (dict): - Dictionary of variables mapped to their label for their node in the pipeline. 
- - output_variables (list[str]): - List of the human-readable names of the output variables in order - """ - for output_variable in output_variables: - variable_block = variables[output_variable] - diagram.edge(variable_block, output_variable + '_output') - def _make_diagram_alignment(self, diagram, cluster_edges): """ Modifies the diagram to add alignment edges and connect alternative names to the blocks. @@ -1152,7 +1122,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges): """ with diagram.subgraph() as alignment: alignment.attr('graph', penwidth='0') + alignment.attr('node', penwidth='0') alignment.attr('edge', len='1', minlen='1', penwidth='1') + for first_block, second_block in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: cluster.edge(first_block, second_block) @@ -1187,12 +1159,25 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram.attr('graph', splines='ortho') diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges diagram.attr('edge', tooltip=' ') + diagram.attr('node', shape='box', penwidth='0') - self._make_diagram_blocks(diagram) - variables = self._make_diagram_inputs(diagram, fit) output_variables = self._make_diagram_outputs(diagram, outputs) - cluster_edges = self._make_diagram_variables(diagram, fit, variables) - self._make_diagram_output_connections(diagram, variables, output_variables) + + cluster_edges = set() + variable_blocks = dict((name, {name + '_output'}) for name in output_variables) + for block_name, block in reversed(self.blocks.items()): + relevant_output_names = self._get_relevant_output_variables(block_name, block, + variable_blocks.keys()) + if len(relevant_output_names) == 0: + continue # skip this block + + self._make_diagram_block(diagram, block_name) + self._make_block_outputs(diagram, block_name, relevant_output_names, cluster_edges, + variable_blocks) + self._make_block_inputs(diagram, fit, block_name, block, cluster_edges, + variable_blocks) + + self._make_diagram_inputs(diagram, variable_blocks) self._make_diagram_alignment(diagram, cluster_edges) if image_path: From 8deb6d64324842656f98968f79ae13d0e7c3c8b9 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 4 Jun 2020 16:56:26 -0700 Subject: [PATCH 106/160] Remove intermediate arrowheads --- mlblocks/mlpipeline.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 6d2738ba..2465ea5f 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -900,6 +900,7 @@ def _get_relevant_output_variables(self, block_name, block, current_output_varia Args: block_name (str): The name of the block from which the variables are outputted + block (MLBlock): The block from which the variables are outputted @@ -959,10 +960,12 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var Block whose input variables are to be added to the diagram cluster_edges (set): - Set of edges between alternative variable names and their corresponding block + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead variable_blocks (dict): - Dictionary of variable names and the set of blocks into which the variable connects + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use """ input_alt_names = self.input_names.get(block_name, dict()) 
input_variables = set(variable['name'] for variable in block.produce_args) @@ -974,18 +977,20 @@ def _make_block_inputs(self, diagram, fit, block_name, block, cluster_edges, var for input_name in input_variables: input_block = block_name + arrowhead = 'normal' if input_name in input_alt_names: input_variable_label = block_name + ' ' + input_name + ' (input)' diagram.node(input_variable_label, '(' + input_name + ')', fontcolor='blue') - cluster_edges.add((input_variable_label, block_name)) + cluster_edges.add((input_variable_label, block_name, 'normal')) input_name = input_alt_names[input_name] input_block = input_variable_label + arrowhead = 'none' if input_name in variable_blocks.keys(): - variable_blocks[input_name].add(input_block) + variable_blocks[input_name].add((input_block, arrowhead)) else: - variable_blocks[input_name] = {input_block} + variable_blocks[input_name] = {(input_block, arrowhead)} def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, variable_blocks): @@ -1006,10 +1011,12 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, Set of output variable names to be added to the diagram cluster_edges (set): - Set of edges between alternative variable names and their corresponding block + Set of tuples representing edges between alternative variable names and their + corresponding block and the type of arrowhead variable_blocks (dict): - Dictionary of variable names and the set of blocks into which the variable connects + Dictionary of variable names and the set of tuples of blocks into which the + variable connects and the type of arrowhead to use """ output_alt_names = self.output_names.get(block_name, dict()) for output_name in output_names: @@ -1018,7 +1025,7 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, alt_variable_label = block_name + ' ' + output_name + ' (output)' diagram.node(alt_variable_label, '(' + output_name + ')', fontcolor='red') - cluster_edges.add((block_name, alt_variable_label)) + cluster_edges.add((block_name, alt_variable_label, 'none')) output_name = output_alt_names[output_name] output_block = alt_variable_label @@ -1026,8 +1033,8 @@ def _make_block_outputs(self, diagram, block_name, output_names, cluster_edges, diagram.node(output_variable_label, output_name) diagram.edge(output_block, output_variable_label, arrowhead='none') - for block in variable_blocks[output_name]: - diagram.edge(output_variable_label, block) + for block, arrow in variable_blocks[output_name]: + diagram.edge(output_variable_label, block, arrowhead=arrow) del variable_blocks[output_name] @@ -1040,8 +1047,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): Diagram to be modified. 
input_variables_blocks (dict): - Dictionary of input variables of the pipeline and the set of blocks where the - corresponding variable is an input + Dictionary of input variables of the pipeline and the set of tuples of blocks into + which the variable connects and the type of arrowhead to use """ with diagram.subgraph(name="cluster_inputs") as cluster: cluster.attr(tooltip='Input variables') @@ -1056,8 +1063,8 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): cluster.edge('Input', input_name_label) input_variables.append(input_name_label) - for block in blocks: - diagram.edge(input_name_label, block, pendwith='1') + for block, arrow in blocks: + diagram.edge(input_name_label, block, pendwith='1', arrowhead=arrow) with cluster.subgraph() as input_variables_subgraph: input_variables_subgraph.attr(None, rank='same') @@ -1125,9 +1132,9 @@ def _make_diagram_alignment(self, diagram, cluster_edges): alignment.attr('node', penwidth='0') alignment.attr('edge', len='1', minlen='1', penwidth='1') - for first_block, second_block in cluster_edges: + for first_block, second_block, arrow in cluster_edges: with alignment.subgraph(name='cluster_' + first_block + second_block) as cluster: - cluster.edge(first_block, second_block) + cluster.edge(first_block, second_block, arrowhead=arrow) def get_diagram(self, fit=True, outputs='default', image_path=None): """ @@ -1158,13 +1165,12 @@ def get_diagram(self, fit=True, outputs='default', image_path=None): diagram = Digraph(format='png') diagram.attr('graph', splines='ortho') diagram.attr(tooltip=' ') # hack to remove extraneous tooltips on edges - diagram.attr('edge', tooltip=' ') diagram.attr('node', shape='box', penwidth='0') output_variables = self._make_diagram_outputs(diagram, outputs) cluster_edges = set() - variable_blocks = dict((name, {name + '_output'}) for name in output_variables) + variable_blocks = dict((name, {(name + '_output', 'normal')}) for name in output_variables) for block_name, block in reversed(self.blocks.items()): relevant_output_names = self._get_relevant_output_variables(block_name, block, variable_blocks.keys()) From ea8bb9a25e12ee13b29f985874ef15f1f032e690 Mon Sep 17 00:00:00 2001 From: Erica Chiu Date: Thu, 4 Jun 2020 17:55:04 -0700 Subject: [PATCH 107/160] Add diagram tests --- tests/data/diagrams/diagram_fit.txt | 40 +++++++++ .../data/diagrams/diagram_multiple_blocks.txt | 44 +++++++++ tests/data/diagrams/diagram_simple.txt | 40 +++++++++ tests/test_mlpipeline.py | 90 +++++++++++++++++++ 4 files changed, 214 insertions(+) create mode 100644 tests/data/diagrams/diagram_fit.txt create mode 100644 tests/data/diagrams/diagram_multiple_blocks.txt create mode 100644 tests/data/diagrams/diagram_simple.txt diff --git a/tests/data/diagrams/diagram_fit.txt b/tests/data/diagrams/diagram_fit.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_fit.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + 
"a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_multiple_blocks.txt b/tests/data/diagrams/diagram_multiple_blocks.txt new file mode 100644 index 00000000..3f43a108 --- /dev/null +++ b/tests/data/diagrams/diagram_multiple_blocks.txt @@ -0,0 +1,44 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_b_output [label=output_variable_b] + output_variable_b_output -> Output + { + rank=same + rankdir=LR + } + } + "b_primitive#1" [label=b_primitive penwidth=1] + "b_primitive#1 output_variable_b" [label=output_variable_b] + "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none] + "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal] + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable_a" [label=output_variable_a] + "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none] + "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/data/diagrams/diagram_simple.txt b/tests/data/diagrams/diagram_simple.txt new file mode 100644 index 00000000..7939b5e3 --- /dev/null +++ b/tests/data/diagrams/diagram_simple.txt @@ -0,0 +1,40 @@ +digraph { + graph [splines=ortho] + tooltip=" " + node [penwidth=0 shape=box] + subgraph cluster_outputs { + tooltip="Output variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Output [label=Output fontsize=14 tooltip="Output variables"] + output_variable_output [label=output_variable] + output_variable_output -> Output + { + rank=same + rankdir=LR + } + } + "a_primitive#1" [label=a_primitive penwidth=1] + "a_primitive#1 output_variable" [label=output_variable] + "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] + "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] + input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] + subgraph cluster_inputs { + tooltip="Input variables" + graph [bgcolor=azure3 penwidth=0 rank=source] + node [fontsize=20 penwidth=0] + edge [arrowhead=none penwidth=0] + Input [label=Input fontsize=14 tooltip="Input variables"] + input_variable_input [label=input_variable] + Input -> 
input_variable_input + { + rank=same + } + } + { + graph [penwidth=0] + node [penwidth=0] + edge [len=1 minlen=1 penwidth=1] + } +} diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 340a3838..9d649ad1 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,96 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_simple(self): + f = open('tests/data/diagrams/diagram_simple.txt', 'r') + expected = f.read()[:-1] + f.close() + + output = [ + { + 'name': 'output_variable', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable' + } + ] + + pipeline = MLPipeline(['a_primitive'], outputs={'default': output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = output + + assert str(pipeline.get_diagram()) == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_fit(self): + f = open('tests/data/diagrams/diagram_fit.txt', 'r') + expected = f.read()[:-1] + f.close() + + output = [ + { + 'name': 'output_variable', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable' + } + ] + + pipeline = MLPipeline(['a_primitive'], outputs={'default': output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = output + + assert str(pipeline.get_diagram()) == expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_diagram_multiple_blocks(self): + f = open('tests/data/diagrams/diagram_multiple_blocks.txt', 'r') + expected = f.read()[:-1] + f.close() + + first_output = [ + { + 'name': 'output_variable_a', + 'type': 'another_whatever', + 'variable': 'a_primitive#1.output_variable_a' + } + ] + second_output = [ + { + 'name': 'output_variable_b', + 'type': 'another_whatever', + 'variable': 'b_primitive#1.output_variable_b' + } + ] + + pipeline = MLPipeline(['a_primitive', 'b_primitive'], outputs={'default': second_output}) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input_variable', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = first_output + pipeline.blocks['b_primitive#1'].produce_args = first_output + pipeline.blocks['b_primitive#1'].produce_output = second_output + + assert str(pipeline.get_diagram()) == expected + def test_fit(self): pass From 73865035c6fac86321ea86368d515d4fed068dba Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 15:52:08 +0300 Subject: [PATCH 108/160] added dictionary to record block execution time --- mlblocks/mlpipeline.py | 17 +++++++++++++++++ setup.py | 1 + 2 files changed, 18 insertions(+) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index dcfc8a0b..6fc789d4 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -8,6 +8,7 @@ import warnings from collections import Counter, OrderedDict, defaultdict from copy import deepcopy +from datetime import datetime import numpy as np @@ -223,6 +224,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.set_hyperparameters(hyperparameters) self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') + self.time = dict() def _get_str_output(self, 
output): """Get the outputs that correspond to the str specification.""" @@ -390,6 +392,18 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] + def get_time(self): + """Get the execution time of each block. + + If called before fitting the pipeline, it will return an empty dictionary. + + Returns: + dict: + A dictionary containing the block names as keys and + the execution time in seconds as values. + """ + return self.time.copy() + def _extract_block_name(self, variable_name): return self._re_block_name.search(variable_name).group(1) @@ -616,7 +630,10 @@ def _fit_block(self, block, block_name, context): LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) + start = datetime.utcnow() block.fit(**fit_args) + elapsed = datetime.utcnow() - start + self.time[block_name] = elapsed.total_seconds() except Exception: if self.verbose: LOGGER.exception("Exception caught fitting MLBlock %s", block_name) diff --git a/setup.py b/setup.py index a4fcc7a3..56ab70cd 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ install_requires = [ + 'Keras>=2.1.6,<2.4' ] From d35544ed72850f6ed4224f16e1344039b1bfb2f1 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 22:39:37 +0300 Subject: [PATCH 109/160] add debug argument for fit/predict --- mlblocks/mlpipeline.py | 90 ++++++++++++++++++++++++++++++------------ setup.py | 1 - 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 6fc789d4..8e5d0629 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -224,7 +224,6 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.set_hyperparameters(hyperparameters) self._re_block_name = re.compile(r'(^[^#]+#\d+)(\..*)?') - self.time = dict() def _get_str_output(self, output): """Get the outputs that correspond to the str specification.""" @@ -392,18 +391,6 @@ def get_output_variables(self, outputs='default'): outputs = self.get_outputs(outputs) return [output['variable'] for output in outputs] - def get_time(self): - """Get the execution time of each block. - - If called before fitting the pipeline, it will return an empty dictionary. - - Returns: - dict: - A dictionary containing the block names as keys and - the execution time in seconds as values. 
- """ - return self.time.copy() - def _extract_block_name(self, variable_name): return self._re_block_name.search(variable_name).group(1) @@ -625,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context): + def _fit_block(self, block, block_name, context, debug=False): """Get the block args from the context and fit the block.""" LOGGER.debug("Fitting block %s", block_name) try: @@ -633,14 +620,21 @@ def _fit_block(self, block, block_name, context): start = datetime.utcnow() block.fit(**fit_args) elapsed = datetime.utcnow() - start - self.time[block_name] = elapsed.total_seconds() + + if debug: + debug_info = { + "elapsed": elapsed.total_seconds(), + "input": fit_args + } + return debug_info + except Exception: if self.verbose: LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - def _produce_block(self, block, block_name, context, output_variables, outputs): + def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update @@ -649,7 +643,9 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) + start = datetime.utcnow() block_outputs = block.produce(**produce_args) + elapsed = datetime.utcnow() - start outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) context.update(outputs_dict) @@ -662,13 +658,21 @@ def _produce_block(self, block, block_name, context, output_variables, outputs): variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) + if debug: + debug_info = { + "elapsed": elapsed.total_seconds(), + "input": produce_args, + "output": outputs_dict + } + return debug_info + except Exception: if self.verbose: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): + def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): """Fit the blocks of this pipeline. Sequentially call the ``fit`` and the ``produce`` methods of each block, @@ -698,6 +702,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. + debug (boolean): + Debug mode, if True a dictionary containing the block names as keys and + the execution time in seconds, input, output as values is returned. + **kwargs: Any additional keyword arguments will be directly added to the context dictionary and available for the blocks. 
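In practice the flag documented above changes what ``fit`` and ``predict`` hand back. A short sketch of both call styles, in which ``pipeline`` and the data variables are illustrative assumptions:

    # Illustrative sketch of the new debug flag; the names are assumptions.
    # With no output specification, fit returns only the debug dictionary.
    debug_info = pipeline.fit(X_train, y_train, debug=True)
    for block_name, record in debug_info['fit'].items():
        print(block_name, record['elapsed'])  # seconds spent fitting the block

    # When an output is requested, the result and the debug dictionary are
    # returned together as a tuple; predict behaves the same way.
    predictions, produce_info = pipeline.predict(X_test, debug=True)

For ``fit`` the dictionary carries ``fit`` and ``produce`` sections keyed by block name, while for ``predict`` it is keyed by block name directly; each record stores ``elapsed`` and ``input``, plus ``output`` for produce steps.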
@@ -725,6 +733,10 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): if isinstance(start_, int): start_ = self._get_block_name(start_) + debug_info = None + if debug: + debug_info = defaultdict(dict) + for block_name, block in self.blocks.items(): if start_: if block_name == start_: @@ -733,10 +745,15 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - self._fit_block(block, block_name, context) + out = self._fit_block(block, block_name, context, debug) + if debug: + debug_info["fit"][block_name] = out if (block_name != self._last_block_name) or (block_name in output_blocks): - self._produce_block(block, block_name, context, output_variables, outputs) + out = self._produce_block( + block, block_name, context, output_variables, outputs, debug) + if debug: + debug_info["produce"][block_name] = out # We already captured the output from this block if block_name in output_blocks: @@ -746,15 +763,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): # outputs we are done. if output_variables is not None and not output_blocks: if len(outputs) > 1: - return tuple(outputs) + result = tuple(outputs) else: - return outputs[0] + result = outputs[0] + + if debug: + return result, debug_info + + return result + + if debug: + return debug_info if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) - def predict(self, X=None, output_='default', start_=None, **kwargs): + def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the ``produce`` method of each block, capturing the @@ -780,6 +805,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. + debug (boolean): + Debug mode, if True a dictionary containing the block names as keys and + the execution time in seconds, input, output as values is returned. + **kwargs: Any additional keyword arguments will be directly added to the context dictionary and available for the blocks. @@ -798,6 +827,10 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): if isinstance(start_, int): start_ = self._get_block_name(start_) + debug_info = None + if debug: + debug_info = dict() + for block_name, block in self.blocks.items(): if start_: if block_name == start_: @@ -806,7 +839,9 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): LOGGER.debug("Skipping block %s produce", block_name) continue - self._produce_block(block, block_name, context, output_variables, outputs) + out = self._produce_block(block, block_name, context, output_variables, outputs, debug) + if debug: + debug_info[block_name] = out # We already captured the output from this block if block_name in output_blocks: @@ -816,9 +851,14 @@ def predict(self, X=None, output_='default', start_=None, **kwargs): # outputs we are done. 
if not output_blocks: if len(outputs) > 1: - return tuple(outputs) + result = tuple(outputs) else: - return outputs[0] + result = outputs[0] + + if debug: + return result, debug_info + + return result if start_: # We skipped all the blocks up to the end diff --git a/setup.py b/setup.py index 56ab70cd..a4fcc7a3 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ install_requires = [ - 'Keras>=2.1.6,<2.4' ] From f0cd86f2073e6e1c1a3efe6a0535458374bc597e Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 28 Jul 2020 23:03:53 +0300 Subject: [PATCH 110/160] update mlprimitive test version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a4fcc7a3..85020231 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2,<0.3', + 'mlprimitives>=0.2.4.dev0', 'setuptools>=41.0.0', 'numpy<1.17', 'rundoc>=0.4.3', From 2909c03289df305113eae94d41f779263d25f3f6 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 00:46:12 +0300 Subject: [PATCH 111/160] cap sphinx --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 85020231..4048cbbb 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,8 @@ 'watchdog>=0.8.3', # docs - 'm2r>=0.2.0', - 'Sphinx>=1.7.1', + 'm2r>=0.2.0,<0.3', + 'Sphinx>=1.7.1,<3', 'sphinx_rtd_theme>=0.2.4', 'graphviz>=0.9', 'ipython>=6.5.0', From 22a955f47a60e778b50de752f232345e13aac64b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 01:18:21 +0300 Subject: [PATCH 112/160] cap isort --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4048cbbb..57c623f0 100644 --- a/setup.py +++ b/setup.py @@ -57,8 +57,8 @@ 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check - 'flake8>=3.5.0', - 'isort>=4.3.4', + 'flake8>=3.5.0,<3.8', + 'isort>=4.3.4<5', # fix style issues 'autoflake>=1.2', # keep this after flake8 to avoid From 444f301f641e03150490ade67604e4cc9a23703b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 29 Jul 2020 01:53:42 +0300 Subject: [PATCH 113/160] cap isort (properly) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 57c623f0..c5cf4015 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ # style check 'flake8>=3.5.0,<3.8', - 'isort>=4.3.4<5', + 'isort>=4.3.4,<5', # fix style issues 'autoflake>=1.2', # keep this after flake8 to avoid From e2b6eb3e0d41717579a2949598af411ddbad1a47 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Thu, 30 Jul 2020 04:15:46 +0300 Subject: [PATCH 114/160] debug dictionary passing + added debug tests --- mlblocks/mlpipeline.py | 36 +++---- tests/test_mlpipeline.py | 198 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 18 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 8e5d0629..8367b327 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -612,7 +612,7 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context, debug=False): + def _fit_block(self, block, block_name, context, debug=None): """Get the block args from the context and fit the block.""" LOGGER.debug("Fitting block %s", block_name) try: @@ -621,12 +621,11 @@ def _fit_block(self, block, block_name, context, debug=False): block.fit(**fit_args) 
elapsed = datetime.utcnow() - start - if debug: - debug_info = { + if debug is not None: + debug["fit"][block_name] = { "elapsed": elapsed.total_seconds(), "input": fit_args } - return debug_info except Exception: if self.verbose: @@ -634,7 +633,7 @@ def _fit_block(self, block, block_name, context, debug=False): raise - def _produce_block(self, block, block_name, context, output_variables, outputs, debug=False): + def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update @@ -658,13 +657,17 @@ def _produce_block(self, block, block_name, context, output_variables, outputs, variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) - if debug: - debug_info = { + if debug is not None: + record = { "elapsed": elapsed.total_seconds(), "input": produce_args, "output": outputs_dict } - return debug_info + + if "fit" in debug.keys(): + debug["produce"][block_name] = record + else: + debug[block_name] = record except Exception: if self.verbose: @@ -745,15 +748,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): LOGGER.debug("Skipping block %s fit", block_name) continue - out = self._fit_block(block, block_name, context, debug) - if debug: - debug_info["fit"][block_name] = out + self._fit_block(block, block_name, context, debug_info) if (block_name != self._last_block_name) or (block_name in output_blocks): - out = self._produce_block( - block, block_name, context, output_variables, outputs, debug) - if debug: - debug_info["produce"][block_name] = out + self._produce_block( + block, block_name, context, output_variables, outputs, debug_info) # We already captured the output from this block if block_name in output_blocks: @@ -839,9 +838,7 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) LOGGER.debug("Skipping block %s produce", block_name) continue - out = self._produce_block(block, block_name, context, output_variables, outputs, debug) - if debug: - debug_info[block_name] = out + self._produce_block(block, block_name, context, output_variables, outputs, debug_info) # We already captured the output from this block if block_name in output_blocks: @@ -860,6 +857,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) return result + if debug: + return debug_info + if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 340a3838..25a90edb 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,204 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_no_debug(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + returned = mlpipeline.fit(debug=False) + + assert returned is None + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_debug(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + expected_return = dict() + expected_return["fit"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + } + } + } + + 
returned = mlpipeline.fit(debug=True) + + print(returned) + assert isinstance(returned, dict) + assert set(returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name + + for block_name, dictionary in expected_return["fit"].items(): + assert set(returned["fit"][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_produce_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return["fit"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + } + } + } + expected_return["produce"] = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + }, + "output": { + "whatever" + } + } + } + + returned, debug_returned = mlpipeline.fit(output_='default', debug=True) + + assert len([returned]) == len(outputs["default"]) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys()) + assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys()) + + for block_name, dictionary in expected_return["fit"].items(): + assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys()) + + for block_name, dictionary in expected_return["produce"].items(): + assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_predict_no_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + }, + { + 'name': 'b_name', + 'variable': 'a_primitive#1.b_variable', + 'type': 'b_type', + }, + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + }, + { + 'name': 'b_name', + 'type': 'b_type' + } + ] + + returned = mlpipeline.predict(debug=False) + assert len(returned) == len(outputs["default"]) + for returned_output, expected_output in zip(returned, outputs["default"]): + assert returned_output == expected_output["variable"] + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_predict_debug(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return = { + "a_primitive#1": { + "elapsed": 0, + "input": { + "whatever" + }, + "output": { + "whatever" + } + } + } + returned, 
debug_returned = mlpipeline.predict(debug=True) + assert len([returned]) == len(outputs["default"]) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) + + for block_name, dictionary in expected_return.items(): + assert set(debug_returned[block_name].keys()) == set(dictionary.keys()) + def test_fit(self): pass From 54c6698a6df91fe646071f5f971224beefa12f1a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 16:58:57 +0200 Subject: [PATCH 115/160] Update dependencies and tutorials --- .travis.yml | 6 +- Makefile | 23 +- .../tutorials/1. Using and MLPipeline.ipynb | 2 +- .../2. Finding and Loading a Pipeline.ipynb | 35 +- .... Setting MLPipeline Hyperparameters.ipynb | 2 +- ...ial execution and pipeline debugging.ipynb | 2 +- .../6. Flexible outputs specification.ipynb | 18 +- examples/tutorials/7. Tuning a Pipeline.ipynb | 46 +- ...or the best pipeline with BTBSession.ipynb | 533 +++++++----------- setup.py | 33 +- tests/test_mlpipeline.py | 3 +- tox.ini | 29 +- 12 files changed, 327 insertions(+), 405 deletions(-) diff --git a/.travis.yml b/.travis.yml index 136bd690..7c63a880 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ # Config file for automatic testing at travis-ci.org +dist: bionic language: python python: - 3.6 @@ -6,8 +7,9 @@ python: # Command to install dependencies install: - - pip install -U tox-travis codecov - - sudo apt-get install graphviz + - sudo apt-get update + - sudo apt-get install graphviz pandoc + - pip install -U tox-travis codecov # Command to run tests script: tox diff --git a/Makefile b/Makefile index eb422682..6cc80705 100644 --- a/Makefile +++ b/Makefile @@ -110,13 +110,30 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle # TEST TARGETS -.PHONY: test -test: ## run tests quickly with the default Python +.PHONY: test-unit +test-unit: ## run tests quickly with the default Python python -m pytest --cov=mlblocks .PHONY: test-readme test-readme: ## run the readme snippets - rundoc run --single-session python3 -t python3 README.md + rm -rf tests/readme_test && mkdir tests/readme_test + cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md + rm -rf tests/readme_test + +.PHONY: test-tutorials +test-tutorials: ## run the tutorial notebooks + find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \; + +.PHONY: test +test: test-unit test-readme ## test everything that needs test dependencies + +.PHONY: check-dependencies +check-dependencies: ## test if there are any broken dependencies + pip check + +.PHONY: test-devel +test-devel: check-dependencies lint docs ## test everything that needs development dependencies .PHONY: test-all test-all: ## run tests on every Python version with tox diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb index 733fb42d..dab130ea 100644 --- a/examples/tutorials/1. Using and MLPipeline.ipynb +++ b/examples/tutorials/1. Using and MLPipeline.ipynb @@ -625,7 +625,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/2. Finding and Loading a Pipeline.ipynb b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb index 8df76259..7f14662a 100644 --- a/examples/tutorials/2. 
Finding and Loading a Pipeline.ipynb +++ b/examples/tutorials/2. Finding and Loading a Pipeline.ipynb @@ -52,18 +52,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['image.classification.hog.random_forest',\n", - " 'image.classification.hog.xgboost',\n", - " 'image.classification.resnet50.xgboost']" + "['image.classification.hog.rf',\n", + " 'image.classification.hog.xgb',\n", + " 'image.classification.resnet50.xgb',\n", + " 'keras.Sequential.SingleLayerCNNImageClassifier',\n", + " 'keras.Sequential.VGGCNNClassifier']" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -72,7 +74,7 @@ "from mlblocks.discovery import find_pipelines\n", "\n", "filters = {\n", - " 'metadata.data_modality': 'image',\n", + " 'metadata.data_type': 'image',\n", " 'metadata.task_type': 'classification',\n", "}\n", "\n", @@ -89,13 +91,26 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n", + "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", + "\n" + ] + } + ], "source": [ "from mlblocks import MLPipeline\n", "\n", - "pipeline = MLPipeline('image.classification.resnet50.xgboost')" + "pipeline = MLPipeline('image.classification.resnet50.xgb')" ] } ], @@ -115,7 +130,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 725226f7..5b7944b5 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -204,7 +204,7 @@ " }\n", "}\n", "pipeline = MLPipeline(\n", - " 'single_table.classification.categorical_encoder.xgboost',\n", + " primitives,\n", " init_params=init_params\n", ")" ] diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb index 2e21c85b..57b2b43c 100644 --- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb +++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb @@ -704,7 +704,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb index 3dc3686f..ca1048dd 100644 --- a/examples/tutorials/6. Flexible outputs specification.ipynb +++ b/examples/tutorials/6. 
Flexible outputs specification.ipynb @@ -380,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -400,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -418,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -454,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -463,7 +463,7 @@ "(24420, 108)" ] }, - "execution_count": 28, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -483,7 +483,7 @@ "(24420, 108)" ] }, - "execution_count": 29, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +509,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index 8dbc4366..4b6eae24 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. Tuning a Pipeline.ipynb @@ -58,7 +58,7 @@ "source": [ "from mlblocks import MLPipeline\n", "\n", - "template = MLPipeline('single_table.classification.categorical_encoder.xgboost')" + "template = MLPipeline('single_table.classification.xgb')" ] }, { @@ -204,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -213,7 +213,7 @@ "0.8686773872402614" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -238,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -261,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -346,13 +346,15 @@ "text": [ "scoring pipeline 1\n", "scoring pipeline 2\n", - "New best found: 0.8722706212975673\n", "scoring pipeline 3\n", "scoring pipeline 4\n", + "New best found: 0.8642241881762839\n", "scoring pipeline 5\n", "scoring pipeline 6\n", "scoring pipeline 7\n", + "New best found: 0.8644390957265209\n", "scoring pipeline 8\n", + "New best found: 0.8679095503945804\n", "scoring pipeline 9\n", "scoring pipeline 10\n" ] @@ -386,23 +388,23 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 
'max_labels'): 40,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1971742459927317,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.22575517380871246,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4}" + " 'max_labels'): 39,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'most_frequent',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 70,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 6,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.07406443671152008,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9244108160038952,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -422,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -431,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -455,7 +457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index a1f0c0f4..1fb4d7ca 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb @@ -103,11 +103,7 @@ "source": [ "from mlblocks.discovery import find_pipelines\n", "\n", - "filters = {\n", - " 'metadata.data_modality': 'single_table',\n", - " 'metadata.task_type': 'classification'\n", - "}\n", - "templates = find_pipelines(filters=filters)" + "templates = find_pipelines('single_table.classification')" ] }, { @@ -118,13 +114,9 @@ { "data": { "text/plain": [ - "['single_table.classification.categorical_encoder.logit',\n", - " 'single_table.classification.categorical_encoder.random_forest',\n", - " 'single_table.classification.categorical_encoder.xgboost',\n", - " 'single_table.classification.mlprimitives.logit',\n", - " 'single_table.classification.mlprimitives.random_forest',\n", - " 'single_table.classification.mlprimitives.xgboost',\n", - " 'single_table.classification.mlprimitives_text.xgboost']" + "['single_table.classification',\n", + " 'single_table.classification.text',\n", + " 'single_table.classification.xgb']" ] }, "execution_count": 4, @@ -165,7 +157,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -174,7 +166,7 @@ } ], "source": [ - "templates_dict['single_table.classification.mlprimitives.xgboost']" + "templates_dict['single_table.classification.xgb']" ] }, { @@ -250,12 +242,6 @@ "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): {'type': 'bool', 'default': True},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): {'type': 'bool', 'default': True},\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): {'type': 'int', 'default': 1000, 'range': [1, 10000]},\n", " 
('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", " 'default': 'mean',\n", " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", @@ -282,7 +268,7 @@ } ], "source": [ - "tunables['single_table.classification.mlprimitives.xgboost']" + "tunables['single_table.classification.xgb']" ] }, { @@ -296,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -322,13 +308,15 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 11, + "metadata": { + "scrolled": false + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fe9bb1cfdb2f48d4b6c8614ae1d357a1", + "model_id": "342fe40f08024adcb5b60eea25f49d37", "version_major": 2, "version_minor": 0 }, @@ -343,18 +331,98 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-01-23 20:16:01,059 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:01,060 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:16:03,274 - INFO - session - New optimal found: single_table.classification.categorical_encoder.logit - 0.7975185708718643\n", - "2020-01-23 20:16:03,284 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:03,285 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:16:05,584 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:05,585 - INFO - session - Obtaining default configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:16:10,613 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8639171383183359\n", - "2020-01-23 20:16:10,617 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:10,618 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:16:13,090 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:16:13,093 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.random_forest\n" + "2020-09-16 16:32:40,826 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:40,827 - INFO - btb.session - Obtaining default configuration for single_table.classification\n", + "2020-09-16 16:32:46,432 - INFO - btb.session - New optimal found: single_table.classification - 0.8639171383183359\n", + "2020-09-16 16:32:46,435 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:46,436 - INFO - btb.session - Obtaining default configuration for single_table.classification.text\n", + "2020-09-16 16:32:46,583 - ERROR - mlblocks.mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File 
\"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-09-16 16:32:46,586 - ERROR - btb.session - Proposal 2 - single_table.classification.text crashed with the following configuration: ('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", + "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", + "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", + "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", + "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", + " return self._engine.get_loc(key)\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " 
File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/btb/session.py\", line 336, in run\n", + " score = self._scorer(tunable_name, config)\n", + " File \"\", line 11, in cross_validate\n", + " pipeline.fit(X_train, y_train)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 754, in fit\n", + " block, block_name, context, output_variables, outputs, debug_info)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " block_outputs = block.produce(**produce_args)\n", + " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " texts = X[self.column]\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " indexer = self.columns.get_loc(key)\n", + " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", + " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", + " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + "KeyError: 'text'\n", + "2020-09-16 16:32:46,587 - WARNING - btb.session - Too many errors: 1. 
Removing tunable single_table.classification.text\n", + "2020-09-16 16:32:46,589 - INFO - btb.session - Creating Tunable instance from dict.\n", + "2020-09-16 16:32:46,589 - INFO - btb.session - Obtaining default configuration for single_table.classification.xgb\n", + "2020-09-16 16:32:52,100 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:33:28,900 - INFO - btb.session - New optimal found: single_table.classification - 0.8728234138413778\n", + "2020-09-16 16:33:28,904 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n" ] }, { @@ -367,20 +435,26 @@ { "data": { "text/plain": [ - "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", - " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " 'max_labels'): 63,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 7315,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.8639171383183359}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", + " 'score': 0.8728234138413778}" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -411,26 +485,32 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '51a54054874dd7a83ff0e785ffdfee3b',\n", - " 'name': 'single_table.classification.categorical_encoder.xgboost',\n", + "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " 'max_labels'): 63,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 7315,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.8639171383183359}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 
0.23231879890615814,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", + " 'score': 0.8728234138413778}" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -455,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": { "scrolled": false }, @@ -463,7 +543,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a76ce44e1173496e99baaf7ee39a3df7", + "model_id": "8dd5d4626f304c279b2b368a671b6cb7", "version_major": 2, "version_minor": 0 }, @@ -478,219 +558,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-01-23 20:17:59,163 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:17:59,163 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:18:04,640 - INFO - session - Creating Tunable instance from dict.\n", - "2020-01-23 20:18:04,640 - INFO - session - Obtaining default configuration for single_table.classification.mlprimitives_text.xgboost\n", - "2020-01-23 20:18:04,779 - ERROR - mlpipeline - Exception caught producing MLBlock mlprimitives.custom.text.TextCleaner#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", - " return self._engine.get_loc(key)\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", - " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", - " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", - " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", - " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "2020-01-23 20:18:04,799 - ERROR - session - Proposal 7 - single_table.classification.mlprimitives_text.xgboost crashed with the following configuration: 
('mlprimitives.custom.text.TextCleaner#1', 'lower'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'accents'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True\n", - "('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 1000\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): gini\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_features'): None\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_depth'): 1\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_split'): 2\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_samples_leaf'): 1\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_weight_fraction_leaf'): 0.0\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'max_leaf_nodes'): 2\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True\n", - "('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2657, in get_loc\n", - " return self._engine.get_loc(key)\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 722, in fit\n", - " self._produce_block(block, block_name, context, output_variables, outputs)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 635, in _produce_block\n", - " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 322, in produce\n", - " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/Projects/MIT/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", - " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", - " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File 
\"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - "KeyError: 'text'\n", - "2020-01-23 20:18:04,801 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives_text.xgboost\n", - "2020-01-23 20:18:04,803 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:18:22,026 - INFO - session - New optimal found: single_table.classification.categorical_encoder.xgboost - 0.8687079630193402\n", - "2020-01-23 20:18:22,031 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:19:13,106 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:19:13,334 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:19:13,339 - ERROR - session - Proposal 10 - single_table.classification.categorical_encoder.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 29\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): False\n", - "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 71156\n", - "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", - "('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", - "('sklearn.linear_model.LogisticRegression#1', 'C'): 40.699406362214916\n", - "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", - "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 933.5409791334005\n", - "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0017748534037681438\n", - "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", - " self._fit_block(block, block_name, context)\n", - " File 
\"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2020-01-23 20:19:13,340 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.categorical_encoder.logit\n", - "2020-01-23 20:19:13,343 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:19:26,076 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:19:31,573 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:19:34,763 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:20:15,775 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:21:49,655 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:21:49,946 - ERROR - mlpipeline - Exception caught fitting MLBlock sklearn.linear_model.LogisticRegression#1\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:21:49,948 - ERROR - session - Proposal 16 - single_table.classification.mlprimitives.logit crashed with the following configuration: ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'lowercase'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'binary'): True\n", - "('mlprimitives.custom.feature_extraction.StringVectorizer#1', 'max_features'): 4707\n", - "('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - "('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True\n", - "('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 26014\n", - "('sklearn.linear_model.LogisticRegression#1', 'solver'): newton-cg\n", - 
"('sklearn.linear_model.LogisticRegression#1', 'penalty'): l1\n", - "('sklearn.linear_model.LogisticRegression#1', 'C'): 34.878827238511434\n", - "('sklearn.linear_model.LogisticRegression#1', 'multi_class'): multinomial\n", - "('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 406.1952335959628\n", - "('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.008653762646621075\n", - "('sklearn.linear_model.LogisticRegression#1', 'dual'): True\n", - "Traceback (most recent call last):\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/btb/session.py\", line 272, in run\n", - " score = self.scorer(tunable_name, config)\n", - " File \"\", line 11, in cross_validate\n", - " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 719, in fit\n", - " self._fit_block(block, block_name, context)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlpipeline.py\", line 619, in _fit_block\n", - " block.fit(**fit_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks/mlblocks/mlblock.py\", line 302, in fit\n", - " getattr(self.instance, self.fit_method)(**fit_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 1280, in fit\n", - " solver = _check_solver(self.solver, self.penalty, self.dual)\n", - " File \"/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/linear_model/logistic.py\", line 447, in _check_solver\n", - " \"got %s penalty.\" % (solver, penalty))\n", - "ValueError: Solver newton-cg supports only l2 penalties, got l1 penalty.\n", - "2020-01-23 20:21:49,951 - WARNING - session - Too many errors: 1. Removing tunable single_table.classification.mlprimitives.logit\n", - "2020-01-23 20:21:49,953 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:22:23,153 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. 
\"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:458: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.\n", - " warn(\"Some inputs do not have OOB scores. \"\n", - "/home/xals/.virtualenvs/MLBlocks/lib/python3.6/site-packages/sklearn/ensemble/forest.py:463: RuntimeWarning: invalid value encountered in true_divide\n", - " predictions[k].sum(axis=1)[:, np.newaxis])\n", - "2020-01-23 20:22:24,832 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:22:46,026 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:22:53,670 - INFO - session - New optimal found: single_table.classification.mlprimitives.xgboost - 0.8739290413691612\n", - "2020-01-23 20:22:53,677 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n", - "2020-01-23 20:22:55,126 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.random_forest\n", - "2020-01-23 20:23:10,345 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.xgboost\n", - "2020-01-23 20:23:15,497 - INFO - session - Generating new proposal configuration for single_table.classification.mlprimitives.xgboost\n", - "2020-01-23 20:23:28,746 - INFO - session - Generating new proposal configuration for single_table.classification.categorical_encoder.random_forest\n" + "2020-09-16 16:34:46,679 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:35:39,310 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:36:53,519 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:37:31,639 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:37:34,254 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:38:33,930 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:38:46,228 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:39:09,193 - INFO - btb.session - New optimal found: single_table.classification - 0.8730998313333643\n", + "2020-09-16 16:39:09,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:40:06,793 - INFO - btb.session - 
Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:40:44,917 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:41:19,357 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:41:29,076 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:41:46,742 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:42:24,199 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:42:37,998 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:43:03,272 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:44:01,301 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", + "2020-09-16 16:44:12,500 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:44:32,221 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n", + "2020-09-16 16:45:20,148 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n" ] }, { @@ -703,26 +591,26 @@ { "data": { "text/plain": [ - "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", - " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 22,\n", + " 'max_labels'): 97,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'lowercase'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'binary'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 3863,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " 'max_features'): 270,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", - " 'score': 0.8739290413691612}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", + " 'score': 0.8730998313333643}" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -757,32 +645,32 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': 'd9854a57d48100da0f3584dc4490301f',\n", - " 'name': 'single_table.classification.mlprimitives.xgboost',\n", + "{'id': '52f65be5a78a6c557b8c5bf868bfdb7d',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 22,\n", + " 'max_labels'): 
97,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'lowercase'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", " 'binary'): True,\n", " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 3863,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 193,\n", + " 'max_features'): 270,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 556,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.29839198565184866,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19826736959824165,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 4},\n", - " 'score': 0.8739290413691612}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.4023947989981499,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9595910516937898,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", + " 'score': 0.8730998313333643}" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -818,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": { "scrolled": false }, @@ -826,25 +714,36 @@ { "data": { "text/plain": [ - "[{'id': '9dd9a11254f46b11ad42a12692b4965e',\n", - " 'name': 'single_table.classification.categorical_encoder.logit',\n", + "[{'id': 'c2cd14c7e9470448a0eeb58a3cce327f',\n", + " 'name': 'single_table.classification',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", " 'max_labels'): 0,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 1000,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'fit_intercept'): True,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'max_iter'): 100,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'solver'): 'liblinear',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'penalty'): 'l2',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'C'): 1.0,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'multi_class'): 'ovr',\n", - " ('sklearn.linear_model.LogisticRegression#1', 'intercept_scaling'): 1.0,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'tol'): 0.0001,\n", - " ('sklearn.linear_model.LogisticRegression#1', 'dual'): False},\n", - " 'score': 0.7975185708718643},\n", - " {'id': 'f7ef0814341cee4f05280077b9b3de9c',\n", - " 'name': 'single_table.classification.categorical_encoder.random_forest',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.8639171383183359},\n", + " {'id': 'adbd189a819483ddc869ceb94513b369',\n", + " 'name': 
'single_table.classification.text',\n", + " 'config': {('mlprimitives.custom.text.TextCleaner#1', 'lower'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'accents'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'stopwords'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'non_alpha'): True,\n", + " ('mlprimitives.custom.text.TextCleaner#1', 'single_chars'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'lowercase'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'binary'): True,\n", + " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", + " 'max_features'): 1000,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'n_estimators'): 10,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'criterion'): 'gini',\n", @@ -858,10 +757,10 @@ " ('sklearn.ensemble.RandomForestClassifier#1', 'min_impurity_decrease'): 0.0,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'bootstrap'): True,\n", " ('sklearn.ensemble.RandomForestClassifier#1', 'oob_score'): False},\n", - " 'score': 0.7591904454179904}]" + " 'score': None}]" ] }, - "execution_count": 20, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +786,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 945385da..b1aafccb 100644 --- a/setup.py +++ b/setup.py @@ -20,15 +20,20 @@ examples_require = [ - 'mlprimitives>=0.2.4.dev0', - 'jupyter==1.0.0' + 'matplotlib>=2.2.2,<3.2.2', + 'mlprimitives>=0.2.5,<0.3', + 'boto3>=1.14,<1.14.45', + 'botocore<1.17.45,>=1.17.44', + 'jupyter==1.0.0', + 'docutils<0.16,>=0.10', + 'baytune>=0.3.0,<0.4', ] tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2.4.dev0', + 'mlprimitives>=0.2,<0.3', 'setuptools>=41.0.0', 'numpy<1.17', 'rundoc>=0.4.3', @@ -43,34 +48,32 @@ development_requires = [ # general - 'bumpversion>=0.5.3', + 'bumpversion>=0.5.3,<0.6', 'pip>=9.0.1', - 'watchdog>=0.8.3', + 'watchdog>=0.8.3,<0.11', # docs 'm2r>=0.2.0,<0.3', 'Sphinx>=1.7.1,<3', - 'sphinx_rtd_theme>=0.2.4', + 'sphinx_rtd_theme>=0.2.4,<0.5', 'ipython>=6.5.0', - 'matplotlib>=2.2.3', 'autodocsumm>=0.1.10', - 'docutils<0.15,>=0.10', # botocore incompatibility with 0.15 # style check - 'flake8>=3.5.0,<3.8', + 'flake8>=3.7.7,<4', 'isort>=4.3.4,<5', # fix style issues - 'autoflake>=1.2', # keep this after flake8 to avoid - 'autopep8>=1.3.5', # version incompatibilities with flake8 + 'autoflake>=1.1,<2', + 'autopep8>=1.4.3,<2', # distribute on PyPI - 'twine>=1.10.0', + 'twine>=1.10.0,<4', 'wheel>=0.30.0', # Advanced testing - 'tox>=2.9.1', - 'coverage>=4.5.1', + 'coverage>=4.5.1,<6', + 'tox>=2.9.1,<4', # Documentation style 'doc8>=0.8.0', @@ -93,7 +96,7 @@ description="Pipelines and primitives for machine learning and data science.", extras_require={ 'dev': development_requires + tests_require + examples_require, - 'test': tests_require, + 'test': tests_require + examples_require, 'examples': examples_require, }, include_package_data=True, diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index ffdd8deb..59e11633 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -879,6 +879,7 @@ def test_predict_debug(self): for block_name, dictionary in expected_return.items(): assert set(debug_returned[block_name].keys()) == set(dictionary.keys()) + 
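# Mock MLBlock here as well, so that building the pipeline for the
+    # diagram test does not attempt to load real primitive annotations.
+    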
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_simple(self): f = open('tests/data/diagrams/diagram_simple.txt', 'r') expected = f.read()[:-1] @@ -984,4 +985,4 @@ def test_from_dict(self): pass def test_load(self): - pass \ No newline at end of file + pass diff --git a/tox.ini b/tox.ini index 1b8a777e..96d29dbe 100644 --- a/tox.ini +++ b/tox.ini @@ -1,37 +1,20 @@ [tox] -envlist = py35, py36, lint, docs, readme - +envlist = py3{5,6}, test-devel [travis] python = - 3.6: py36, lint, docs + 3.6: py36, test-devel 3.5: py35 - [testenv] passenv = CI TRAVIS TRAVIS_* -setenv = - PYTHONPATH = {toxinidir} +skipsdist = false +skip_install = false extras = test commands = /usr/bin/env make test - -[testenv:lint] -skipsdist = true -extras = dev -commands = - /usr/bin/env make lint - - -[testenv:docs] -skipsdist = true +[testenv:test-devel] extras = dev commands = - /usr/bin/env make docs - - -[testenv:readme] -skipsdist = true -commands = - /usr/bin/env make test-readme + /usr/bin/env make test-devel From 6ac5731d69c71533499fbac8ac90932289ebc1c7 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 18:10:55 +0200 Subject: [PATCH 116/160] Add travis wait --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7c63a880..97f4bcf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: - pip install -U tox-travis codecov # Command to run tests -script: tox +script: travis_wait tox after_success: codecov From 0fac3ce2cc2f4d4982982c52e63d1e9198a91896 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 16 Sep 2020 20:07:57 +0200 Subject: [PATCH 117/160] travis wait 60 min --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97f4bcf8..51ac1dd8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: - pip install -U tox-travis codecov # Command to run tests -script: travis_wait tox +script: travis_wait 60 tox after_success: codecov From f6bff86bb061a85789981bfbf0a0366c6cab7f95 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Nov 2020 12:47:35 +0100 Subject: [PATCH 118/160] Remove unused datasets module --- mlblocks/datasets.py | 447 ----------------------------------------- tests/test_datasets.py | 58 ------ 2 files changed, 505 deletions(-) delete mode 100644 mlblocks/datasets.py delete mode 100644 tests/test_datasets.py diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py deleted file mode 100644 index 0c69afda..00000000 --- a/mlblocks/datasets.py +++ /dev/null @@ -1,447 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Datasets module. - -This module contains functions that allow loading datasets for easy -testing of pipelines and primitives over multiple data modalities -and task types. 
-
-The available datasets by data modality and task type are:
-
-+---------------+---------------+-------------------------+
-| Dataset       | Data Modality | Task Type               |
-+===============+===============+=========================+
-| Amazon        | Graph         | Community Detection     |
-+---------------+---------------+-------------------------+
-| DIC28         | Graph         | Graph Matching          |
-+---------------+---------------+-------------------------+
-| UMLs          | Graph         | Link Prediction         |
-+---------------+---------------+-------------------------+
-| Nomination    | Graph         | Vertex Nomination       |
-+---------------+---------------+-------------------------+
-| USPS          | Image         | Classification          |
-+---------------+---------------+-------------------------+
-| Hand Geometry | Image         | Regression              |
-+---------------+---------------+-------------------------+
-| Iris          | Single Table  | Classification          |
-+---------------+---------------+-------------------------+
-| Jester        | Single Table  | Collaborative Filtering |
-+---------------+---------------+-------------------------+
-| Boston        | Single Table  | Regression              |
-+---------------+---------------+-------------------------+
-| Wiki QA       | Multi Table   | Classification          |
-+---------------+---------------+-------------------------+
-| Personae      | Text          | Classification          |
-+---------------+---------------+-------------------------+
-| News Groups   | Text          | Classification          |
-+---------------+---------------+-------------------------+
-
-"""
-
-import io
-import logging
-import os
-import tarfile
-import urllib
-
-import networkx as nx
-import numpy as np
-import pandas as pd
-from keras.preprocessing.image import img_to_array, load_img
-from sklearn import datasets
-from sklearn.metrics import accuracy_score, normalized_mutual_info_score, r2_score
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-
-LOGGER = logging.getLogger(__name__)
-
-INPUT_SHAPE = [224, 224, 3]
-
-DATA_PATH = os.path.join(
-    os.path.dirname(__file__),
-    'data'
-)
-DATA_URL = 'http://dai-mlblocks.s3.amazonaws.com/%7B%7D.tar.gz'
-
-
-class Dataset():
-    """Dataset class.
-
-    This class represents the abstraction of a dataset and works as
-    a container of all the things needed in order to use a dataset
-    for testing.
-
-    Among other things, it includes the actual dataset data, information
-    about its origin, a score function that works for this dataset,
-    and a method to split the data in multiple ways for goodness-of-fit
-    evaluation.
-
-    Attributes:
-        name (str): Name of this dataset.
-        description (str): Short description about the data that composes this dataset.
-        data (array-like): Numpy array or pandas DataFrame containing all the data of
-            this dataset, excluding the labels or target values.
-        target (array-like): Numpy array or pandas Series containing the expected labels
-            or values.
-        **kwargs: Any additional keyword argument passed on initialization is also
-            available as an instance attribute.
-
-    Args:
-        description (str): Short description about the data that composes this dataset.
-            The first line of the description is expected to be a human friendly
-            name for the dataset, and will be set as the `name` attribute.
-        data (array-like): Numpy array or pandas DataFrame containing all the data of
-            this dataset, excluding the labels or target values.
-        target (array-like): Numpy array or pandas Series containing the expected labels
-            or values.
-        score (callable): Function that will be used to compute the score of this dataset.
-        shuffle (bool): Whether or not to shuffle the data before splitting.
-        stratify (bool): Whether to use a stratified or regular KFold for splitting.
-        **kwargs: Any additional keyword argument passed on initialization will be made
-            available as an instance attribute.
-    """
-
-    def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
-
-        self.name = description.splitlines()[0]
-        self.description = description
-
-        self.data = data
-        self.target = target
-
-        self._stratify = stratify
-        self._shuffle = shuffle
-        self._score = score
-
-        self.__dict__.update(kwargs)
-
-    def score(self, *args, **kwargs):
-        r"""Scoring function for this dataset.
-
-        Args:
-            \*args, \*\*kwargs: Any given arguments and keyword arguments will be
-                directly passed to the given scoring function.
-
-        Returns:
-            float:
-                The computed score.
-        """
-        return self._score(*args, **kwargs)
-
-    def __repr__(self):
-        return self.name
-
-    def describe(self):
-        """Print the description of this Dataset on stdout."""
-        print(self.description)
-
-    @staticmethod
-    def _get_split(data, index):
-        if hasattr(data, 'iloc'):
-            return data.iloc[index]
-        else:
-            return data[index]
-
-    def get_splits(self, n_splits=1, random_state=0):
-        """Return splits of this dataset ready for Cross Validation.
-
-        If n_splits is 1, a tuple containing the X for train and test
-        and the y for train and test is returned.
-        Otherwise, if n_splits is bigger than 1, a list of such tuples
-        is returned, one for each split.
-
-        Args:
-            n_splits (int): Number of times that the data needs to be split.
-
-        Returns:
-            tuple or list:
-                If n_splits is 1, a tuple containing the X for train and test
-                and the y for train and test is returned.
-                Otherwise, if n_splits is bigger than 1, a list of such tuples
-                is returned, one for each split.
-        """
-        if n_splits == 1:
-            stratify = self.target if self._stratify else None
-
-            return train_test_split(
-                self.data,
-                self.target,
-                shuffle=self._shuffle,
-                stratify=stratify,
-                random_state=random_state
-            )
-
-        else:
-            cv_class = StratifiedKFold if self._stratify else KFold
-            cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state)
-
-            splits = list()
-            for train, test in cv.split(self.data, self.target):
-                X_train = self._get_split(self.data, train)
-                y_train = self._get_split(self.target, train)
-                X_test = self._get_split(self.data, test)
-                y_test = self._get_split(self.target, test)
-                splits.append((X_train, X_test, y_train, y_test))
-
-            return splits
-
-
-def _download(dataset_name, dataset_path):
-    url = DATA_URL.format(dataset_name)
-
-    LOGGER.debug('Downloading dataset %s from %s', dataset_name, url)
-    response = urllib.request.urlopen(url)
-    bytes_io = io.BytesIO(response.read())
-
-    LOGGER.debug('Extracting dataset into %s', DATA_PATH)
-    with tarfile.open(fileobj=bytes_io, mode='r:gz') as tf:
-        tf.extractall(DATA_PATH)
-
-
-def _load(dataset_name):
-    if not os.path.exists(DATA_PATH):
-        os.makedirs(DATA_PATH)
-
-    dataset_path = os.path.join(DATA_PATH, dataset_name)
-    if not os.path.exists(dataset_path):
-        _download(dataset_name, dataset_path)
-
-    return dataset_path
-
-
-def _load_images(image_dir, filenames):
-    LOGGER.debug('Loading %s images from %s', len(filenames), image_dir)
-    images = []
-    for filename in filenames:
-        filename = os.path.join(image_dir, filename)
-
-        image = load_img(filename)
-        image = image.resize(tuple(INPUT_SHAPE[0:2]))
-        image = img_to_array(image)
-        image = image / 255.0  # Normalize pixel values to the [0, 1] range.
-        images.append(image)
-
-    return np.array(images)
-
-
-def _load_csv(dataset_path, name, set_index=False):
-    csv_path = os.path.join(dataset_path, name + '.csv')
-
-    LOGGER.debug('Loading csv %s', csv_path)
-    df = pd.read_csv(csv_path)
-
-    if set_index:
-        df = df.set_index(df.columns[0], drop=False)
-
-    return df
-
-
-def load_usps():
-    """USPS Digits Dataset.
-
-    The data of this dataset is a 4d numpy array with shape (9298, 224, 224, 3)
-    containing 9298 224x224 RGB photos of handwritten digits, and the target is
-    a 1d numpy integer array containing the label of the digit represented in
-    the image.
-    """
-    dataset_path = _load('usps')
-
-    df = _load_csv(dataset_path, 'data')
-    X = _load_images(os.path.join(dataset_path, 'images'), df.image)
-    y = df.label.values
-
-    return Dataset(load_usps.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_handgeometry():
-    """Hand Geometry Dataset.
-
-    The data of this dataset is a 4d numpy array with shape (112, 224, 224, 3)
-    containing 112 224x224 RGB photos of hands, and the target is a 1d numpy
-    float array containing the width of the wrist in centimeters.
-    """
-    dataset_path = _load('handgeometry')
-
-    df = _load_csv(dataset_path, 'data')
-    X = _load_images(os.path.join(dataset_path, 'images'), df.image)
-    y = df.target.values
-
-    return Dataset(load_handgeometry.__doc__, X, y, r2_score)
-
-
-def load_personae():
-    """Personae Dataset.
-
-    The data of this dataset is a 2d numpy array containing 145 entries
-    that include texts written by Dutch users in Twitter, with some additional
-    information about the author, and the target is a 1d numpy binary integer
-    array indicating whether the author was extrovert or not.
-    """
-    dataset_path = _load('personae')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    return Dataset(load_personae.__doc__, X, y, accuracy_score, stratify=True)
-
-
-def load_umls():
-    """UMLs Dataset.
-
-    The data consists of information about a graph of 135 nodes and the relations
-    between them, given as a DataFrame with three columns, source, target and type,
-    indicating which nodes are related and with which type of link. The target is
-    a 1d numpy binary integer array indicating whether the indicated link exists
-    or not.
-    """
-    dataset_path = _load('umls')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_umls.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_dic28():
-    """DIC28 Dataset from Pajek.
-
-    This network represents connections among English words in a dictionary.
-    It was generated from Knuth's dictionary. Two words are connected by an
-    edge if we can reach one from the other by
-
-    - changing a single character (e.g., work - word)
-    - adding / removing a single character (e.g., ever - fever).
-
-    There exist 52,652 words (vertices in a network) having 2 up to 8 characters
-    in the dictionary. The obtained network has 89038 edges.
-    """
-    dataset_path = _load('dic28')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph1 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph1.gml')))
-    graph2 = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph2.gml')))
-
-    graph = graph1.copy()
-    graph.add_nodes_from(graph2.nodes(data=True))
-    graph.add_edges_from(graph2.edges)
-    graph.add_edges_from(X[['graph1', 'graph2']].values)
-
-    graphs = {
-        'graph1': graph1,
-        'graph2': graph2,
-    }
-
-    return Dataset(load_dic28.__doc__, X, y, accuracy_score,
-                   stratify=True, graph=graph, graphs=graphs)
-
-
-def load_nomination():
-    """Sample 1 of graph vertex nomination data from MII Lincoln Lab.
-
-    Data consists of one graph whose nodes contain two attributes, attr1 and attr2.
-    Associated with each node is a label that has to be learned and predicted.
-    """
-    dataset_path = _load('nomination')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_nomination.__doc__, X, y, accuracy_score, stratify=True, graph=graph)
-
-
-def load_amazon():
-    """Amazon product co-purchasing network and ground-truth communities.
-
-    Network was collected by crawling Amazon website. It is based on Customers Who Bought
-    This Item Also Bought feature of the Amazon website. If a product i is frequently
-    co-purchased with product j, the graph contains an undirected edge from i to j.
-    Each product category provided by Amazon defines each ground-truth community.
-    """
-    dataset_path = _load('amazon')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('label').values
-
-    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))
-
-    return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score, graph=graph)
-
-
-def load_jester():
-    """Ratings from the Jester Online Joke Recommender System.
-
-    This dataset consists of over 1.7 million instances of (user_id, item_id, rating)
-    triples, which is split 50-50 into train and test data.
-
-    source: "University of California Berkeley, CA"
-    sourceURI: "http://eigentaste.berkeley.edu/dataset/"
-    """
-    dataset_path = _load('jester')
-
-    X = _load_csv(dataset_path, 'data')
-    y = X.pop('rating').values
-
-    return Dataset(load_jester.__doc__, X, y, r2_score)
-
-
-def load_wikiqa():
-    """Challenge Dataset for Open-Domain Question Answering.
-
-    WikiQA dataset is a publicly available set of question and sentence (QS) pairs,
-    collected and annotated for research on open-domain question answering.
-
-    source: "Microsoft"
-    sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#"
-    """  # noqa
-    dataset_path = _load('wikiqa')
-
-    data = _load_csv(dataset_path, 'data', set_index=True)
-    questions = _load_csv(dataset_path, 'questions', set_index=True)
-    sentences = _load_csv(dataset_path, 'sentences', set_index=True)
-    vocabulary = _load_csv(dataset_path, 'vocabulary', set_index=True)
-
-    entities = {
-        'data': (data, 'd3mIndex', None),
-        'questions': (questions, 'qIndex', None),
-        'sentences': (sentences, 'sIndex', None),
-        'vocabulary': (vocabulary, 'index', None)
-    }
-    relationships = [
-        ('questions', 'qIndex', 'data', 'qIndex'),
-        ('sentences', 'sIndex', 'data', 'sIndex')
-    ]
-
-    target = data.pop('isAnswer').values
-
-    return Dataset(load_wikiqa.__doc__, data, target, accuracy_score, stratify=True,
-                   entities=entities, relationships=relationships)
-
-
-def load_newsgroups():
-    """20 News Groups Dataset.
-
-    The data of this dataset is a 1d numpy array containing the texts
-    from 11314 newsgroups posts, and the target is a 1d numpy integer array
-    containing the label of one of the 20 topics that they are about.
-    """
-    dataset = datasets.fetch_20newsgroups()
-    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
-                   accuracy_score, stratify=True)
-
-
-def load_iris():
-    """Iris Dataset."""
-    dataset = datasets.load_iris()
-    return Dataset(load_iris.__doc__, dataset.data, dataset.target,
-                   accuracy_score, stratify=True)
-
-
-def load_boston():
-    """Boston House Prices Dataset."""
-    dataset = datasets.load_boston()
-    return Dataset(load_boston.__doc__, dataset.data, dataset.target, r2_score)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
deleted file mode 100644
index 174a85d6..00000000
--- a/tests/test_datasets.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from unittest import TestCase
-from unittest.mock import Mock
-
-from mlblocks import datasets
-
-
-class TestDataset(TestCase):
-
-    def setUp(self):
-        self.description = """Dataset Name.
-
-        Some extended description.
- """ - - dataset = datasets.Dataset(description, 'data', 'target', 'score') - dataset.describe() - - captured = capsys.readouterr() - assert captured.out == description + '\n' From 7afbe40e6006ab5c228df7e8f9ae3e3cc3ab1ce5 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 19 Nov 2020 12:48:42 +0100 Subject: [PATCH 119/160] Update python support and dependency ranges --- .travis.yml | 5 +++-- README.md | 2 +- setup.py | 10 ++++++---- tox.ini | 7 ++++--- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 51ac1dd8..d2a982f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,8 +2,9 @@ dist: bionic language: python python: + - 3.8 + - 3.7 - 3.6 - - 3.5 # Command to install dependencies install: @@ -26,4 +27,4 @@ deploy: target-branch: gh-pages on: branch: master - python: 3.6 + python: 3.8 diff --git a/README.md b/README.md index fa4260d5..127089ac 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid diff --git a/setup.py b/setup.py index b1aafccb..b07eccb6 100644 --- a/setup.py +++ b/setup.py @@ -16,12 +16,13 @@ install_requires = [ 'graphviz>=0.9,<1', + 'numpy>=1.17.1,<1.19', ] examples_require = [ 'matplotlib>=2.2.2,<3.2.2', - 'mlprimitives>=0.2.5,<0.3', + 'mlprimitives>=0.2.6.dev0,<0.3', 'boto3>=1.14,<1.14.45', 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', @@ -33,9 +34,8 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2,<0.3', + 'mlprimitives>=0.2.6.dev0,<0.3', 'setuptools>=41.0.0', - 'numpy<1.17', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', ] @@ -90,8 +90,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ], description="Pipelines and primitives for machine learning and data science.", extras_require={ @@ -107,6 +108,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 96d29dbe..1bc3f81a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,11 @@ [tox] -envlist = py3{5,6}, test-devel +envlist = py3{6,7,8}, test-devel [travis] python = - 3.6: py36, test-devel - 3.5: py35 + 3.8: py38, test-devel + 3.7: py37 + 3.6: py36 [testenv] passenv = CI TRAVIS TRAVIS_* From cf419bd64b2f90aafd0e56df25325103c646e45e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 13 Dec 2020 18:16:46 +0100 Subject: [PATCH 120/160] Update baytune dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b07eccb6..6ae0c75e 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', 'docutils<0.16,>=0.10', - 'baytune>=0.3.0,<0.4', + 'baytune>=0.3.13.dev0,<0.4', ] From 68774a040ee489ee4abbb1e73acfe30ab556bfb2 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 13 Dec 2020 18:31:08 +0100 Subject: [PATCH 
121/160] Change links to mlbazaar

---
 CONTRIBUTING.rst                              |  8 +++---
 HISTORY.md                                    | 26 +++++++++----------
 README.md                                     | 22 ++++++++--------
 docs/advanced_usage/adding_primitives.rst     |  6 ++---
 docs/advanced_usage/hyperparameters.rst       |  4 +--
 docs/advanced_usage/primitives.rst            |  6 ++---
 docs/conf.py                                  |  2 +-
 docs/getting_started/install.rst              |  2 +-
 docs/getting_started/quickstart.rst           |  2 +-
 docs/index.rst                                |  6 ++---
 docs/pipeline_examples/graph.rst              |  2 +-
 docs/pipeline_examples/image.rst              |  2 +-
 docs/pipeline_examples/multi_table.rst        |  2 +-
 docs/pipeline_examples/text.rst               |  6 ++---
 examples/README.md                            |  6 ++---
 .... Setting MLPipeline Hyperparameters.ipynb |  2 +-
 examples/tutorials/7. Tuning a Pipeline.ipynb |  2 +-
 ...or the best pipeline with BTBSession.ipynb |  2 +-
 mlblocks/__init__.py                          |  2 +-
 setup.py                                      |  2 +-
 20 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 4c01093e..43acf3a0 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -15,7 +15,7 @@ Types of Contributions
 Report Bugs
 ~~~~~~~~~~~
 
-Report bugs at https://github.com/HDI-Project/MLBlocks/issues.
+Report bugs at https://github.com/MLBazaar/MLBlocks/issues.
 
 If you are reporting a bug, please include:
 
@@ -45,7 +45,7 @@ articles, and such.
 Submit Feedback
 ~~~~~~~~~~~~~~~
 
-The best way to send feedback is to file an issue at https://github.com/HDI-Project/MLBlocks/issues.
+The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues.
 
 If you are proposing a feature:
 
@@ -120,8 +120,8 @@ Before you submit a pull request, check that it meets these guidelines:
 4. If the pull request adds functionality, the docs should be updated. Put
    your new functionality into a function with a docstring, and add the
    feature to the list in README.rst.
-5. The pull request should work for Python2.7, 3.4, 3.5 and 3.6. Check
-   https://travis-ci.org/HDI-Project/MLBlocks/pull_requests
+5. The pull request should work for all the supported python versions. Check
+   https://travis-ci.org/MLBazaar/MLBlocks/pull_requests
    and make sure that all the checks pass.
Unit Testing Guidelines diff --git a/HISTORY.md b/HISTORY.md index 5b5d4f0b..17bbda92 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,31 +4,31 @@ Changelog 0.3.4 - 2019-11-01 ------------------ -* Ability to return intermediate context - [Issue #110](https://github.com/HDI-Project/MLBlocks/issues/110) by @csala -* Support for static or class methods - [Issue #107](https://github.com/HDI-Project/MLBlocks/issues/107) by @csala +* Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala +* Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala 0.3.3 - 2019-09-09 ------------------ -* Improved intermediate outputs management - [Issue #105](https://github.com/HDI-Project/MLBlocks/issues/105) by @csala +* Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala 0.3.2 - 2019-08-12 ------------------ -* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/HDI-Project/MLBlocks/issues/96) by @csala -* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/HDI-Project/MLBlocks/issues/95) by @csala -* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/HDI-Project/MLBlocks/issues/94) by @csala -* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/HDI-Project/MLBlocks/issues/90) by @csala -* Add primitive caching - [Issue #22](https://github.com/HDI-Project/MLBlocks/issues/22) by @csala +* Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala +* Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala +* Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala +* Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala +* Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala 0.3.1 - Pipelines Discovery --------------------------- -* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/HDI-Project/MLBlocks/issues/92) by @csala -* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/HDI-Project/MLBlocks/issues/88) by @csala -* Implement partial re-fit -[Issue #61](https://github.com/HDI-Project/MLBlocks/issues/61) by @csala -* Move argument parsing to MLBlock - [Issue #86](https://github.com/HDI-Project/MLBlocks/issues/86) by @csala -* Allow getting intermediate outputs - [Issue #58](https://github.com/HDI-Project/MLBlocks/issues/58) by @csala +* Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala +* Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala +* Implement partial re-fit -[Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala +* Move argument parsing to MLBlock - [Issue #86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala +* Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala 0.3.0 - New Primitives Discovery -------------------------------- diff --git a/README.md b/README.md 
index 127089ac..770f34ef 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,18 @@ Pipelines and Primitives for Machine Learning and Data Science. [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) -[![Travis](https://travis-ci.org/HDI-Project/MLBlocks.svg?branch=master)](https://travis-ci.org/HDI-Project/MLBlocks) -[![CodeCov](https://codecov.io/gh/HDI-Project/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/MLBlocks) +[![Travis](https://travis-ci.org/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.org/MLBazaar/MLBlocks) +[![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks)
# MLBlocks -* Free software: [MIT license](https://github.com/HDI-Project/MLBlocks/blob/master/LICENSE) +* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) * Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) -* Documentation: https://HDI-Project.github.io/MLBlocks -* Homepage: https://github.com/HDI-Project/MLBlocks +* Documentation: https://mlbazaar.github.io/MLBlocks +* Homepage: https://github.com/MLBazaar/MLBlocks ## Overview @@ -38,7 +38,7 @@ Features include: no python code to write, carefully curated by Machine Learning and Domain experts. * Extract machine-readable information about which hyperparameters can be tuned and within which ranges, allowing automated integration with Hyperparameter Optimization tools like - [BTB](https://github.com/HDI-Project/BTB). + [BTB](https://github.com/MLBazaar/BTB). * Complex multi-branch pipelines and DAG configurations, with unlimited number of inputs and outputs per primitive. * Easy save and load Pipelines using JSON Annotations. @@ -65,14 +65,14 @@ pip install mlblocks This will pull and install the latest stable release from [PyPi](https://pypi.org/). If you want to install from source or contribute to the project please read the -[Contributing Guide](https://hdi-project.github.io/MLBlocks/contributing.html#get-started). +[Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started). ## MLPrimitives In order to be usable, MLBlocks requires a compatible primitives library. The official library, required in order to follow the following MLBlocks tutorial, -is [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), which you can install +is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install with this command: ```bash @@ -83,7 +83,7 @@ pip install mlprimitives Below there is a short example about how to use **MLBlocks** to solve the [Adult Census Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a -pipeline which combines primitives from [MLPrimitives](https://github.com/HDI-Project/MLPrimitives), +pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). ```python3 @@ -112,10 +112,10 @@ dataset.score(y_test, predictions) If you want to learn more about how to tune the pipeline hyperparameters, save and load the pipelines using JSON annotations or build complex multi-branched pipelines, please -check our [documentation site](https://HDI-Project.github.io/MLBlocks). +check our [documentation site](https://mlbazaar.github.io/MLBlocks). Also do not forget to have a look at the [notebook tutorials]( -https://github.com/HDI-Project/MLBlocks/tree/master/examples/tutorials)! +https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)! # Citing MLBlocks diff --git a/docs/advanced_usage/adding_primitives.rst b/docs/advanced_usage/adding_primitives.rst index 9d358629..5ad0b60b 100644 --- a/docs/advanced_usage/adding_primitives.rst +++ b/docs/advanced_usage/adding_primitives.rst @@ -17,8 +17,8 @@ This can be achieved by running the commands:: For further details, please refer to the `MLPrimitives Documentation`_. -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _MLPrimitives Documentation: https://hdi-project.github.io/MLPrimitives/ +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. 
_MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/ Writing Primitives ------------------ @@ -27,7 +27,7 @@ Sometimes you will find that you want to use a primitive that is not in the list `MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself by writing the corresponding `JSON annotation `_. -.. _MLPrimitives integrated primitives: https://github.com/HDI-Project/MLPrimitives/tree/master/mlblocks_primitives +.. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives .. note:: If you create new primitives for MLBlocks, please consider contributing them to the **MLPrimitives** project! diff --git a/docs/advanced_usage/hyperparameters.rst b/docs/advanced_usage/hyperparameters.rst index 71686ac5..488be9a9 100644 --- a/docs/advanced_usage/hyperparameters.rst +++ b/docs/advanced_usage/hyperparameters.rst @@ -221,8 +221,8 @@ In this case, the hyperparameters would be annotated like this:: of type, range and default value as a nested dictionary to be used by default. .. _JSON Annotations: primitives.html#json-annotations -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _BTB: https://github.com/HDI-Project/BTB +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _BTB: https://github.com/MLBazaar/BTB .. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline .. _multitype: #multitype-hyperparameters .. _conditional: #conditional-hyperparameters diff --git a/docs/advanced_usage/primitives.rst b/docs/advanced_usage/primitives.rst index 58847bbe..37df9031 100644 --- a/docs/advanced_usage/primitives.rst +++ b/docs/advanced_usage/primitives.rst @@ -311,11 +311,11 @@ For a more detailed description of this class, please check the corresponding section in the `API Reference`_ documentation. .. _API Reference: ../api_reference.html -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives -.. _keras.preprocessing.text.Tokenizer: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json +.. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives +.. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json .. _hyperparameters: hyperparameters.html .. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock .. _pipelines: pipelines.html -.. _examples folder: https://github.com/HDI-Project/MLBlocks/tree/master/examples +.. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples .. _fit: ../api_reference.html#mlblocks.MLBlock.fit .. _produce: ../api_reference.html#mlblocks.MLBlock.produce diff --git a/docs/conf.py b/docs/conf.py index 5ff266d0..f81b7b7e 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,7 +63,7 @@ copyright = '2018, MIT Data To AI Lab' author = 'MIT Data To AI Lab' description = 'Pipelines and Primitives for Machine Learning and Data Science.' -user = 'HDI-Project' +user = 'MLBazaar' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst index d2bda921..d64970a2 100644 --- a/docs/getting_started/install.rst +++ b/docs/getting_started/install.rst @@ -30,7 +30,7 @@ is `MLPrimitives`_, which you can install with this command: pip install mlprimitives -.. _MLPrimitives: https://github.com/HDI-Project/MLPrimitives +.. 
_MLPrimitives: https://github.com/MLBazaar/MLPrimitives Install for development ----------------------- diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index b55223dd..386752dc 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -123,5 +123,5 @@ to obtain predictions from the pipeline. .. _hyperparameters: ../advanced_usage/hyperparameters.html .. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations .. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters -.. _BTB: https://github.com/HDI-Project/BTB +.. _BTB: https://github.com/MLBazaar/BTB .. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters diff --git a/docs/index.rst b/docs/index.rst index e891230c..85717469 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,9 +6,9 @@ What is MLBlocks? :alt: MLBlocks :align: center -* Free software: `MIT license `_ -* Documentation: https://HDI-Project.github.io/MLBlocks -* Homepage: https://github.com/HDI-Project/MLBlocks +* Free software: `MIT license `_ +* Documentation: https://mlbazaar.github.io/MLBlocks +* Homepage: https://github.com/MLBazaar/MLBlocks MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 54ef85a1..8cde5340 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst index e8274761..b9b97ef7 100644 --- a/docs/pipeline_examples/image.rst +++ b/docs/pipeline_examples/image.rst @@ -136,7 +136,7 @@ to an `XGBRegressor`_ primitive. .. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/ .. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur -.. _MLPrimitives primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json +.. _MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json .. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html .. 
_Pretrained Networks from Keras: https://keras.io/applications/ diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst index 109f4015..c2c2066f 100644 --- a/docs/pipeline_examples/multi_table.rst +++ b/docs/pipeline_examples/multi_table.rst @@ -49,5 +49,5 @@ tables are. .. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/ .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn -.. _DeepFeatureSynthesis: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json +.. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json .. _featuretools: https://www.featuretools.com/ diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index 03472ea3..ee0c16ac 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -140,9 +140,9 @@ to encode all the string features, and go directly into the .. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html -.. _TextCleaner primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/text.py -.. _StringVectorizer primitive: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/feature_extraction.py +.. _TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py +.. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py .. _keras text preprocessing: https://keras.io/preprocessing/text/ -.. _Keras LSTM Classifier from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json +.. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json .. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html diff --git a/examples/README.md b/examples/README.md index d295414e..de298ef2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -26,7 +26,7 @@ In order to run these tutorials on your computer, please follow these steps: 1. Clone this github repository: ```bash -git clone git@github.com:HDI-Project/MLBlocks.git +git clone git@github.com:MLBazaar/MLBlocks.git ``` 2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the @@ -45,8 +45,8 @@ cd MLBlocks make install-examples ``` -This will install [MLBLocks](https://github.com/HDI-Project/MLBlocks.git) as well as [MLPrimitives]( -https://github.com/HDI-Project/MLPrimitives.git) and [Jupyter](https://jupyter.org/). +This will install [MLBLocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives]( +https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/). 4. Enter the `examples` folder and start a Jupyter Notebook: diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 5b7944b5..4993fd4e 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. 
Setting MLPipeline Hyperparameters.ipynb
@@ -122,7 +122,7 @@
 "\n",
 "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n",
 "\n",
- "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/HDI-Project/BTB)\n",
+ "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n",
 "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed."
 ]
 },
diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb
index 4b6eae24..ca30df17 100644
--- a/examples/tutorials/7. Tuning a Pipeline.ipynb
+++ b/examples/tutorials/7. Tuning a Pipeline.ipynb
@@ -6,7 +6,7 @@
 "source": [
 "# Tuning a Pipeline\n",
 "\n",
- "This short guide shows how tune a Pipeline using a [BTB](https://github.com/HDI-Project/BTB) Tuner.\n",
+ "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n",
 "\n",
 "Note that some steps are not explained for simplicity. Full details\n",
 "about them can be found in the previous parts of the tutorial.\n",
diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
index 1fb4d7ca..829a38d6 100644
--- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
+++ b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb
@@ -7,7 +7,7 @@
 "# Selecting and Tuning pipelines\n",
 "\n",
 "This guide shows you how to search for multiple pipelines for your problem\n",
- "and later on use a [BTBSession](https://hdi-project.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
+ "and later on use a [BTBSession](https://mlbazaar.github.io/BTB/api/btb.session.html#btb.session.BTBSession)\n",
 "to select and tune the best one.\n",
 "\n",
 "Note that some steps are not explained for simplicity. Full details\n",
diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 618e7a55..300b9093 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -7,7 +7,7 @@
 seamlessly combining tools from any python library with a simple, common
 and uniform interface.
 
 * Free software: MIT license
-* Documentation: https://HDI-Project.github.io/MLBlocks
+* Documentation: https://MLBazaar.github.io/MLBlocks
 """
 
 from mlblocks.discovery import (
diff --git a/setup.py b/setup.py
index 6ae0c75e..0c67cc8d 100644
--- a/setup.py
+++ b/setup.py
@@ -112,7 +112,7 @@
     setup_requires=setup_requires,
     test_suite='tests',
     tests_require=tests_require,
-    url='https://github.com/HDI-Project/MLBlocks',
+    url='https://github.com/MLBazaar/MLBlocks',
     version='0.3.5.dev0',
     zip_safe=False,
 )

From b9a6142e77b50eae9ae1a3aad6eae8dc1e1f6e70 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Sun, 13 Dec 2020 18:46:15 +0100
Subject: [PATCH 122/160] Prevent travis-ci conflict

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0c67cc8d..4ff3a675 100644
--- a/setup.py
+++ b/setup.py
@@ -77,7 +77,10 @@
 
     # Documentation style
     'doc8>=0.8.0',
-    'pydocstyle>=3.0.0'
+    'pydocstyle>=3.0.0',
+
+    # Prevent travis-ci conflict
+    'chardet<4',
 ]
 

From 52653e072a17986da77c666fc5f2a73895f4b40b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 14 Dec 2020 14:03:35 +0100
Subject: [PATCH 123/160] Update Travis badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 770f34ef..103fc113 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Pipelines and Primitives for Machine Learning and Data Science.
 
 [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
 [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks)
-[![Travis](https://travis-ci.org/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.org/MLBazaar/MLBlocks)
+[![Travis](https://travis-ci.com/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.com/MLBazaar/MLBlocks)
 [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks)
 [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks)
 

From c5f3fdfc3de21fffe0053c00fd7d6279243126a9 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com>
Date: Tue, 22 Dec 2020 15:23:57 +0100
Subject: [PATCH 124/160] Add memory debug and profile (#130)

* Add code for memory consumption and optionally select which debug you
  would like to use.
* Add documentation about debugging
* Add psutil
* Tests updates
* Fix lint
* Add extra tests
* Update MLPrimitives version
* Rephrase documentation
---
 docs/advanced_usage/pipelines.rst |  36 ++++++
 mlblocks/mlpipeline.py            | 189 ++++++++++++++++++------------
 setup.py                          |   5 +-
 tests/test_mlpipeline.py          | 184 ++++++++++++++++++++++-------
 4 files changed, 294 insertions(+), 120 deletions(-)

diff --git a/docs/advanced_usage/pipelines.rst b/docs/advanced_usage/pipelines.rst
index e87a0067..07b36c98 100644
--- a/docs/advanced_usage/pipelines.rst
+++ b/docs/advanced_usage/pipelines.rst
@@ -423,6 +423,42 @@ An example of this situation, where we want to reuse the output of the first blo
 
     predictions = pipeline.predict(X_test)
     score = compute_score(y_test, predictions)
 
+Pipeline debugging
+------------------
+
+Sometimes we might be interested in debugging a pipeline execution and obtaining information
+about the time, the memory usage, and the inputs and outputs that each step takes. This is
+possible by using the argument ``debug`` with the methods ``fit`` and ``predict``. This argument
+allows us
+to retrieve critical information from the pipeline execution:
+
+* ``Time``: Elapsed time for the primitive and the given stage (fit or predict).
+* ``Memory``: Amount of memory increase or decrease for the given primitive for that pipeline.
+* ``Input``: The input values that the primitive takes for that specific step.
+* ``Output``: The output produced by the primitive.
+
+
+If the ``debug`` argument is set to ``True`` then a dictionary will be returned containing all the
+elements listed previously::
+
+    result, debug_info = pipeline.fit(X_train, y_train, debug=True)
+
+In case you want to retrieve only some of the elements listed above and skip the rest, you can
+pass an ``str`` to the ``debug`` argument with any combination of the following characters:
+
+* ``i``: To include inputs.
+* ``o``: To include outputs.
+* ``m``: To include used memory.
+* ``t``: To include elapsed time.
+
+For example, if we are only interested in capturing the elapsed time and used memory during the
+``fit`` process, we can call the method as follows::
+
+    result, debug_info = pipeline.fit(X_train, y_train, debug='tm')
+
+.. warning:: Bear in mind that if we use ``debug=True`` or save the ``Input`` and ``Output``,
+             this will consume extra RAM, as it will create copies of the input data and
+             the output data for each primitive. For profiling, it is recommended to use
+             the option ``tm`` as shown in the previous example.
 
 .. _API Reference: ../api_reference.html
 .. _primitives: ../primitives.html
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 6e0744bd..a4111bcb 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -4,6 +4,7 @@
 
 import json
 import logging
+import os
 import re
 import warnings
 from collections import Counter, OrderedDict, defaultdict
@@ -11,6 +12,7 @@
 from datetime import datetime
 
 import numpy as np
+import psutil
 from graphviz import Digraph
 
 from mlblocks.discovery import load_pipeline
@@ -110,14 +112,14 @@ def _build_blocks(self):
             if not block_params:
                 block_params = self.init_params.get(primitive_name, dict())
                 if block_params and block_count > 1:
-                    LOGGER.warning(("Non-numbered init_params are being used "
-                                    "for more than one block %s."), primitive_name)
+                    LOGGER.warning(('Non-numbered init_params are being used '
+                                    'for more than one block %s.'), primitive_name)
 
                 block = MLBlock(primitive, **block_params)
                 blocks[block_name] = block
 
             except Exception:
-                LOGGER.exception("Exception caught building MLBlock %s", primitive)
+                LOGGER.exception('Exception caught building MLBlock %s', primitive)
                 raise
 
         return blocks
@@ -475,8 +477,8 @@ def _sanitize(cls, hyperparameters):
         is a dict containing a complete hyperparameter specification for that block::
 
             {
-                "block_name": {
-                    "hyperparameter_name": "hyperparameter_value",
+                'block_name': {
+                    'hyperparameter_name': 'hyperparameter_value',
                     ...
                 },
                 ...
@@ -487,7 +489,7 @@
         second one::
 
             {
-                ("block_name", "hyperparameter_name"): "hyperparameter_value",
+                ('block_name', 'hyperparameter_name'): 'hyperparameter_value',
                 ...
} @@ -611,39 +613,52 @@ def _update_outputs(self, variable_name, output_variables, outputs, value): index = output_variables.index(variable_name) outputs[index] = deepcopy(value) - def _fit_block(self, block, block_name, context, debug=None): + def _fit_block(self, block, block_name, context, debug_info=None): """Get the block args from the context and fit the block.""" - LOGGER.debug("Fitting block %s", block_name) + LOGGER.debug('Fitting block %s', block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss start = datetime.utcnow() block.fit(**fit_args) elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss - if debug is not None: - debug["fit"][block_name] = { - "elapsed": elapsed.total_seconds(), - "input": fit_args - } + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(fit_args) + + debug_info['fit'][block_name] = record except Exception: if self.verbose: - LOGGER.exception("Exception caught fitting MLBlock %s", block_name) + LOGGER.exception('Exception caught fitting MLBlock %s', block_name) raise - def _produce_block(self, block, block_name, context, output_variables, outputs, debug=None): + def _produce_block(self, block, block_name, context, output_variables, + outputs, debug_info=None): """Get the block args from the context and produce the block. Afterwards, set the block outputs back into the context and update the outputs list if necessary. """ - LOGGER.debug("Producing block %s", block_name) + LOGGER.debug('Producing block %s', block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) + process = psutil.Process(os.getpid()) + memory_before = process.memory_info().rss start = datetime.utcnow() block_outputs = block.produce(**produce_args) elapsed = datetime.utcnow() - start + memory_after = process.memory_info().rss outputs_dict = self._extract_outputs(block_name, block_outputs, block.produce_output) context.update(outputs_dict) @@ -656,21 +671,23 @@ def _produce_block(self, block, block_name, context, output_variables, outputs, variable_name = '{}.{}'.format(block_name, key) self._update_outputs(variable_name, output_variables, outputs, value) - if debug is not None: - record = { - "elapsed": elapsed.total_seconds(), - "input": produce_args, - "output": outputs_dict - } + if debug_info is not None: + debug = debug_info['debug'] + record = {} + if 't' in debug: + record['time'] = elapsed.total_seconds() + if 'm' in debug: + record['memory'] = memory_after - memory_before + if 'i' in debug: + record['input'] = deepcopy(produce_args) + if 'o' in debug: + record['output'] = deepcopy(outputs_dict) - if "fit" in debug.keys(): - debug["produce"][block_name] = record - else: - debug[block_name] = record + debug_info['produce'][block_name] = record except Exception: if self.verbose: - LOGGER.exception("Exception caught producing MLBlock %s", block_name) + LOGGER.exception('Exception caught producing MLBlock %s', block_name) raise @@ -692,21 +709,31 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): y: Fit Data labels, which the pipeline will use to learn how to behave. - output_ (str or int or list or None): Output specification, as required by ``get_outputs``. 
If ``None`` is given, nothing will be returned. - start_ (str or int or None): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - - debug (boolean): - Debug mode, if True a dictionary containing the block names as keys and - the execution time in seconds, input, output as values is returned. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. + + If provided, return a dictionary with the ``fit`` and ``predict`` performance. + This argument can be a string containing a combination of the letters listed above, + or ``True`` which will return a complete debug. **kwargs: Any additional keyword arguments will be directly added @@ -738,13 +765,14 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): debug_info = None if debug: debug_info = defaultdict(dict) + debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' for block_name, block in self.blocks.items(): if start_: if block_name == start_: start_ = False else: - LOGGER.debug("Skipping block %s fit", block_name) + LOGGER.debug('Skipping block %s fit', block_name) continue self._fit_block(block, block_name, context, debug_info) @@ -770,13 +798,13 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): return result - if debug: - return debug_info - if start_: # We skipped all the blocks up to the end raise ValueError('Unknown block name: {}'.format(start_)) + if debug: + return debug_info + def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs): """Produce predictions using the blocks of this pipeline. @@ -791,21 +819,31 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs) Args: X: Data which the pipeline will use to make predictions. - output_ (str or int or list or None): Output specification, as required by ``get_outputs``. If not specified the ``default`` output will be returned. - start_ (str or int or None): Block index or block name to start processing from. The value can either be an integer, which will be interpreted as a block index, or the name of a block, including the conter number at the end. If given, the execution of the pipeline will start on the specified block, and all the blocks before that one will be skipped. - - debug (boolean): - Debug mode, if True a dictionary containing the block names as keys and - the execution time in seconds, input, output as values is returned. + debug (bool or str): + Debug a pipeline with the following options: + + * ``t``: + Elapsed time for the primitive and the given stage (fit or predict). + * ``m``: + Amount of memory incrase (or decrease) for the primitive. This amount + is represented in bytes. + * ``i``: + The input values that the primitive takes for that step. + * ``o``: + The output values that the primitive generates. + + If ``True`` then a dictionary will be returned containing all the elements listed + previously. 
If a ``string`` value combining any of the letters listed above is given,
+                the returned dictionary will contain only the selected elements.

             **kwargs:
                 Any additional keyword arguments will be directly added

@@ -815,6 +853,9 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)
             object or tuple:
                 * If a single output is requested, it is returned alone.
                 * If multiple outputs have been requested, a tuple is returned.
+                * If ``debug`` is given, a tuple will be returned where the first element
+                  is the predictions and the second is a dictionary containing the debug
+                  information.
         """
         context = kwargs.copy()
         if X is not None:

@@ -827,14 +868,15 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)

         debug_info = None
         if debug:
-            debug_info = dict()
+            debug_info = defaultdict(dict)
+            debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio'

         for block_name, block in self.blocks.items():
             if start_:
                 if block_name == start_:
                     start_ = False
                 else:
-                    LOGGER.debug("Skipping block %s produce", block_name)
+                    LOGGER.debug('Skipping block %s produce', block_name)
                     continue

             self._produce_block(block, block_name, context, output_variables, outputs, debug_info)

@@ -856,9 +898,6 @@ def predict(self, X=None, output_='default', start_=None, debug=False, **kwargs)

             return result

-        if debug:
-            return debug_info
-
         if start_:
             # We skipped all the blocks up to the end
             raise ValueError('Unknown block name: {}'.format(start_))

@@ -871,32 +910,32 @@ def to_dict(self):
        specification of the tunable_hyperparameters::

            {
-               "primitives": [
-                   "a_primitive",
-                   "another_primitive"
+               'primitives': [
+                   'a_primitive',
+                   'another_primitive'
                ],
-               "init_params": {
-                   "a_primitive": {
-                       "an_argument": "a_value"
+               'init_params': {
+                   'a_primitive': {
+                       'an_argument': 'a_value'
                    }
                },
-               "hyperparameters": {
-                   "a_primitive#1": {
-                       "an_argument": "a_value",
-                       "another_argument": "another_value",
+               'hyperparameters': {
+                   'a_primitive#1': {
+                       'an_argument': 'a_value',
+                       'another_argument': 'another_value',
                    },
-                   "another_primitive#1": {
-                       "yet_another_argument": "yet_another_value"
+                   'another_primitive#1': {
+                       'yet_another_argument': 'yet_another_value'
                    }
                },
-               "tunable_hyperparameters": {
-                   "another_primitive#1": {
-                       "yet_another_argument": {
-                           "type": "str",
-                           "default": "a_default_value",
-                           "values": [
-                               "a_default_value",
-                               "yet_another_value"
+               'tunable_hyperparameters': {
+                   'another_primitive#1': {
+                       'yet_another_argument': {
+                           'type': 'str',
+                           'default': 'a_default_value',
+                           'values': [
+                               'a_default_value',
+                               'yet_another_value'
                            ]
                        }
                    }

@@ -926,8 +965,8 @@ def _get_simple_block_name(self, block_name):
             str:
                 block name stripped of number and other modifiers.
         """
-        full_name = block_name.split("#")[0]
-        simple_name = full_name.split(".")[-1]
+        full_name = block_name.split('#')[0]
+        simple_name = full_name.split('.')[-1]
         return simple_name

     def _get_context_name_from_variable(self, variable_name):

         Returns:
             str:
                 Name of the context of the variable.
""" - block_name = variable_name.split("#")[0] + block_name = variable_name.split('#')[0] rest = variable_name[len(block_name) + 1:] - block_index = rest.split(".")[0] + block_index = rest.split('.')[0] context_name = rest[len(block_index) + 1:] if len(context_name) == 0: - raise ValueError("Invalid variable name") + raise ValueError('Invalid variable name') return context_name def _get_relevant_output_variables(self, block_name, block, current_output_variables): @@ -1107,7 +1146,7 @@ def _make_diagram_inputs(self, diagram, input_variables_blocks): Dictionary of input variables of the pipeline and the set of tuples of blocks into which the variable connects and the type of arrowhead to use """ - with diagram.subgraph(name="cluster_inputs") as cluster: + with diagram.subgraph(name='cluster_inputs') as cluster: cluster.attr(tooltip='Input variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') @@ -1148,7 +1187,7 @@ def _make_diagram_outputs(self, diagram, outputs): output_variables = [] outputs_vars = self.get_outputs(outputs) - with diagram.subgraph(name="cluster_outputs") as cluster: + with diagram.subgraph(name='cluster_outputs') as cluster: cluster.attr(tooltip='Output variables') cluster.attr('graph', rank='source', bgcolor='azure3', penwidth='0') cluster.attr('node', penwidth='0', fontsize='20') diff --git a/setup.py b/setup.py index 4ff3a675..d76236ae 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,13 @@ install_requires = [ 'graphviz>=0.9,<1', 'numpy>=1.17.1,<1.19', + 'psutil>=5,<6', ] examples_require = [ 'matplotlib>=2.2.2,<3.2.2', - 'mlprimitives>=0.2.6.dev0,<0.3', + 'mlprimitives>=0.3.0.dev0,<0.4', 'boto3>=1.14,<1.14.45', 'botocore<1.17.45,>=1.17.44', 'jupyter==1.0.0', @@ -34,7 +35,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.2.6.dev0,<0.3', + 'mlprimitives>=0.3.0.dev0,<0.4', 'setuptools>=41.0.0', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 59e11633..97c59cd0 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -696,7 +696,7 @@ def test_fit_no_debug(self): assert returned is None @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test_fit_debug(self): + def test_fit_debug_bool(self): mlpipeline = MLPipeline(['a_primitive']) mlpipeline.blocks['a_primitive#1'].fit_args = [ { @@ -706,24 +706,53 @@ def test_fit_debug(self): ] expected_return = dict() - expected_return["fit"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" - } + expected_return['debug'] = 'tmio' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' + }, + 'memory': 0, } } returned = mlpipeline.fit(debug=True) - print(returned) assert isinstance(returned, dict) assert set(returned.keys()) == set(expected_return.keys()) # fit / produce - assert set(returned["fit"].keys()) == set(expected_return["fit"].keys()) # block name + assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name + + for block_name, dictionary in expected_return['fit'].items(): + assert set(returned['fit'][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_debug_str(self): + mlpipeline = MLPipeline(['a_primitive']) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + expected_return = dict() + expected_return['debug'] = 'tm' + 
expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + + returned = mlpipeline.fit(debug='tm') + + assert isinstance(returned, dict) + assert set(returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(returned['fit'].keys()) == set(expected_return['fit'].keys()) # block name - for block_name, dictionary in expected_return["fit"].items(): - assert set(returned["fit"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['fit'].items(): + assert set(returned['fit'][block_name].keys()) == set(dictionary.keys()) @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_produce_debug(self): @@ -759,39 +788,104 @@ def test_fit_produce_debug(self): ] expected_return = dict() - expected_return["fit"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" - } + expected_return['debug'] = 'tmio' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' + }, + 'memory': 0, } } - expected_return["produce"] = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" + expected_return['produce'] = { + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' }, - "output": { - "whatever" - } + 'output': { + 'whatever' + }, + 'memory': 0, } } returned, debug_returned = mlpipeline.fit(output_='default', debug=True) - assert len([returned]) == len(outputs["default"]) + assert len([returned]) == len(outputs['default']) + assert isinstance(debug_returned, dict) + assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce + assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys()) + assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys()) + + for block_name, dictionary in expected_return['fit'].items(): + assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys()) + + for block_name, dictionary in expected_return['produce'].items(): + assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys()) + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_produce_debug_str(self): + outputs = { + 'default': [ + { + 'name': 'a_name', + 'variable': 'a_primitive#1.a_variable', + 'type': 'a_type', + } + ] + } + mlpipeline = MLPipeline(['a_primitive'], outputs=outputs) + mlpipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + + mlpipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'a_name', + 'type': 'a_type' + } + ] + + expected_return = dict() + expected_return['debug'] = 'tm' + expected_return['fit'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + expected_return['produce'] = { + 'a_primitive#1': { + 'time': 0, + 'memory': 0, + } + } + + returned, debug_returned = mlpipeline.fit(output_='default', debug='tm') + + assert len([returned]) == len(outputs['default']) assert isinstance(debug_returned, dict) assert set(debug_returned.keys()) == set(expected_return.keys()) # fit / produce - assert set(debug_returned["fit"].keys()) == set(expected_return["fit"].keys()) - assert set(debug_returned["produce"].keys()) == set(expected_return["produce"].keys()) + assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys()) + assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys()) - for block_name, dictionary in 
expected_return["fit"].items(): - assert set(debug_returned["fit"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['fit'].items(): + assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys()) - for block_name, dictionary in expected_return["produce"].items(): - assert set(debug_returned["produce"][block_name].keys()) == set(dictionary.keys()) + for block_name, dictionary in expected_return['produce'].items(): + assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys()) @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_predict_no_debug(self): @@ -829,9 +923,9 @@ def test_predict_no_debug(self): ] returned = mlpipeline.predict(debug=False) - assert len(returned) == len(outputs["default"]) - for returned_output, expected_output in zip(returned, outputs["default"]): - assert returned_output == expected_output["variable"] + assert len(returned) == len(outputs['default']) + for returned_output, expected_output in zip(returned, outputs['default']): + assert returned_output == expected_output['variable'] @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_predict_debug(self): @@ -861,18 +955,22 @@ def test_predict_debug(self): expected_return = dict() expected_return = { - "a_primitive#1": { - "elapsed": 0, - "input": { - "whatever" + 'a_primitive#1': { + 'time': 0, + 'input': { + 'whatever' }, - "output": { - "whatever" - } + 'output': { + 'whatever' + }, + 'memory': 0 } } + returned, debug_returned = mlpipeline.predict(debug=True) - assert len([returned]) == len(outputs["default"]) + debug_returned = debug_returned['produce'] + + assert len([returned]) == len(outputs['default']) assert isinstance(debug_returned, dict) assert set(debug_returned.keys()) == set(expected_return.keys()) From 9f9c9a14f22e7d2f52e992562a5cc189c0ed12c8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 22 Dec 2020 16:00:33 +0100 Subject: [PATCH 125/160] =?UTF-8?q?Bump=20version:=200.3.5.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 300b9093..08618880 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.3.5.dev0' +__version__ = '0.4.0.dev0' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 61208b1f..32db4562 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.5.dev0 +current_version = 0.4.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index d76236ae..a929025f 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.3.5.dev0', + version='0.4.0.dev0', zip_safe=False, ) From 1af7b1bbc617beaab80f453eec01a145e8685032 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 22 Dec 2020 16:00:48 +0100 Subject: [PATCH 126/160] =?UTF-8?q?Bump=20version:=200.4.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 08618880..e3d6fada 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0.dev0' +__version__ = '0.4.0.dev1' __all__ = [ 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', diff --git a/setup.cfg b/setup.cfg index 32db4562..17998d88 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0.dev0 +current_version = 0.4.0.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index a929025f..0eab74aa 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0.dev0', + version='0.4.0.dev1', zip_safe=False, ) From 84460489fc0a0fb2c762f3f16baf4c3e09d5056a Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Fri, 8 Jan 2021 10:16:22 -0500 Subject: [PATCH 127/160] Stop fitting pipeline after last fit block (#132) * initial early stop * change to stop after fitting the last block with attribute * test early-stop calls * remove comment * change to fit pending --- mlblocks/mlpipeline.py | 35 ++++++++++++++++++++--------- tests/test_mlpipeline.py | 48 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index a4111bcb..d7935757 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -96,6 +96,7 @@ def _get_tunable_hyperparameters(self): def _build_blocks(self): blocks = OrderedDict() + last_fit_block = None block_names_count = Counter() for primitive in self.primitives: @@ -118,11 +119,14 @@ def _build_blocks(self): block = MLBlock(primitive, **block_params) blocks[block_name] = block + if bool(block._fit): + last_fit_block = block_name + except Exception: LOGGER.exception('Exception caught building MLBlock %s', primitive) raise - return blocks + return blocks, last_fit_block @staticmethod def _get_pipeline_dict(pipeline, primitives): @@ -207,7 +211,7 @@ def __init__(self, pipeline=None, primitives=None, init_params=None, self.primitives = primitives or pipeline['primitives'] self.init_params = init_params or pipeline.get('init_params', dict()) - self.blocks = self._build_blocks() + self.blocks, self._last_fit_block = self._build_blocks() self._last_block_name = self._get_block_name(-1) self.input_names = input_names or pipeline.get('input_names', dict()) @@ -767,7 +771,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): debug_info = defaultdict(dict) 
debug_info['debug'] = debug.lower() if isinstance(debug, str) else 'tmio' + fit_pending = True for block_name, block in self.blocks.items(): + if block_name == self._last_fit_block: + fit_pending = False + if start_: if block_name == start_: start_ = False @@ -777,7 +785,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): self._fit_block(block, block_name, context, debug_info) - if (block_name != self._last_block_name) or (block_name in output_blocks): + if fit_pending or output_blocks: self._produce_block( block, block_name, context, output_variables, outputs, debug_info) @@ -787,16 +795,23 @@ def fit(self, X=None, y=None, output_=None, start_=None, debug=False, **kwargs): # If there was an output_ but there are no pending # outputs we are done. - if output_variables is not None and not output_blocks: - if len(outputs) > 1: - result = tuple(outputs) - else: - result = outputs[0] + if output_variables: + if not output_blocks: + if len(outputs) > 1: + result = tuple(outputs) + else: + result = outputs[0] + + if debug: + return result, debug_info + + return result + elif not fit_pending: if debug: - return result, debug_info + return debug_info - return result + return if start_: # We skipped all the blocks up to the end diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 97c59cd0..0ee4cf2c 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -681,6 +681,54 @@ def test_get_inputs_no_fit(self): assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_pending_all_primitives(self): + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + + self_ = MagicMock(autospec=MLPipeline) + self_.blocks = blocks + self_._last_fit_block = 'a.primitive.Name#2' + + MLPipeline.fit(self_) + + expected = [ + call('a.primitive.Name#1'), + call('a.primitive.Name#2') + ] + self_._fit_block.call_args_list = expected + + expected = [ + call('a.primitive.Name#1'), + ] + self_._produce_block.call_args_list = expected + + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_fit_pending_one_primitive(self): + block_1 = get_mlblock_mock() + block_2 = get_mlblock_mock() + blocks = OrderedDict(( + ('a.primitive.Name#1', block_1), + ('a.primitive.Name#2', block_2), + )) + + self_ = MagicMock(autospec=MLPipeline) + self_.blocks = blocks + self_._last_fit_block = 'a.primitive.Name#1' + + MLPipeline.fit(self_) + + expected = [ + call('a.primitive.Name#1'), + ] + self_._fit_block.call_args_list = expected + + assert not self_._produce_block.called + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_no_debug(self): mlpipeline = MLPipeline(['a_primitive']) From 4c2a473c505524e10a850952961a66f35fa41b95 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 16:49:06 +0100 Subject: [PATCH 128/160] Migrate to gh actions (#133) * Cleanup dependencies and migrate to gh-actions * add mlprimitives extra --- .github/workflows/docs.yml | 29 +++++++ .github/workflows/tests.yml | 79 +++++++++++++++++++ .travis.yml | 30 ------- Makefile | 45 +++++++++-- README.md | 21 +++-- apt.txt | 3 + docs/api/mlblocks.datasets.rst | 5 -- docs/api/mlblocks.discovery.rst | 5 -- docs/index.rst | 4 +- docs/pipeline_examples/graph.rst | 2 +- docs/pipeline_examples/image.rst | 6 +- docs/pipeline_examples/multi_table.rst | 2 +- docs/pipeline_examples/single_table.rst | 6 +- 
docs/pipeline_examples/text.rst | 4 +- ...or the best pipeline with BTBSession.ipynb | 2 +- mlblocks/__init__.py | 16 +++- requirements.txt | 2 + setup.cfg | 1 - setup.py | 23 +++--- tox.ini | 2 + 20 files changed, 197 insertions(+), 90 deletions(-) create mode 100644 .github/workflows/docs.yml create mode 100644 .github/workflows/tests.yml delete mode 100644 .travis.yml create mode 100644 apt.txt delete mode 100644 docs/api/mlblocks.datasets.rst delete mode 100644 docs/api/mlblocks.discovery.rst create mode 100644 requirements.txt diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..7093b531 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Generate Docs + +on: + push: + branches: [ stable ] + +jobs: + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Python + uses: actions/setup-python@v1 + with: + python-version: 3.8 + + - name: Build + run: | + sudo apt-get install graphviz pandoc + python -m pip install --upgrade pip + pip install -e .[dev] + make docs + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{secrets.GITHUB_TOKEN}} + publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..ea2c37f5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,79 @@ +name: Run Tests + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ master ] + +jobs: + devel: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package + run: pip install .[dev] + - name: make test-devel + run: make test-devel + + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install rundoc .[mlprimitives] + - name: make test-readme + run: make test-readme + + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install .[test] + - name: make test-unit + run: make test-unit + + tutorials: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'ubuntu-latest' + name: Install dependencies - Ubuntu + run: sudo apt-get install graphviz + - name: Install package and dependencies + run: pip install .[examples] + - name: make test-tutorials + run: make test-tutorials diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d2a982f2..00000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Config file for automatic testing at travis-ci.org -dist: bionic -language: python -python: - - 3.8 - - 3.7 - - 
3.6 - -# Command to install dependencies -install: - - sudo apt-get update - - sudo apt-get install graphviz pandoc - - pip install -U tox-travis codecov - -# Command to run tests -script: travis_wait 60 tox - -after_success: codecov - -deploy: - - - provider: pages - skip-cleanup: true - github-token: "$GITHUB_TOKEN" - keep-history: true - local-dir: docs/_build/html - target-branch: gh-pages - on: - branch: master - python: 3.8 diff --git a/Makefile b/Makefile index 6cc80705..c28da455 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,12 @@ install-test: clean-build clean-pyc ## install the package and test dependencies install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development pip install -e .[dev] +MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') + +.PHONY: install-minimum +install-minimum: ## install the minimum supported versions of the package dependencies + pip install $(MINIMUM) + # LINT TARGETS @@ -123,7 +129,7 @@ test-readme: ## run the readme snippets .PHONY: test-tutorials test-tutorials: ## run the tutorial notebooks find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ - jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null \; + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + .PHONY: test test: test-unit test-readme ## test everything that needs test dependencies @@ -154,11 +160,11 @@ docs: clean-docs ## generate Sphinx HTML documentation, including API docs $(MAKE) -C docs html .PHONY: view-docs -view-docs: docs ## view docs in browser +view-docs: ## view the docs in a browser $(BROWSER) docs/_build/html/index.html .PHONY: serve-docs -serve-docs: view-docs ## compile the docs watching for changes +serve-docs: ## compile the docs watching for changes watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs @@ -170,12 +176,19 @@ dist: clean ## builds source and wheel package python setup.py bdist_wheel ls -l dist -.PHONY: test-publish -test-publish: dist ## package and upload a release on TestPyPI +.PHONY: publish-confirm +publish-confirm: + @echo "WARNING: This will irreversibly upload a new version to PyPI!" + @echo -n "Please type 'confirm' to proceed: " \ + && read answer \ + && [ "$${answer}" = "confirm" ] + +.PHONY: publish-test +publish-test: dist publish-confirm ## package and upload a release on TestPyPI twine upload --repository-url https://test.pypi.org/legacy/ dist/* .PHONY: publish -publish: dist ## package and upload a release +publish: dist publish-confirm ## package and upload a release twine upload dist/* .PHONY: bumpversion-release @@ -204,9 +217,21 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major +.PHONY: bumpversion-revert +bumpversion-revert: ## Undo a previous bumpversion-release + git checkout master + git branch -D stable + +CLEAN_DIR := $(shell git status --short | grep -v ??) 
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) +.PHONY: check-clean +check-clean: ## Check if the directory has uncommitted changes +ifneq ($(CLEAN_DIR),) + $(error There are uncommitted changes) +endif + .PHONY: check-master check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) @@ -220,15 +245,21 @@ ifeq ($(CHANGELOG_LINES),0) endif .PHONY: check-release -check-release: check-master check-history ## Check if the release can be made +check-release: check-clean check-master check-history ## Check if the release can be made @echo "A new release can be made" .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-test +release-test: check-release bumpversion-release-test publish-test bumpversion-revert + .PHONY: release-candidate release-candidate: check-master publish bumpversion-candidate +.PHONY: release-candidate-test +release-candidate-test: check-clean check-master publish-test + .PHONY: release-minor release-minor: check-release bumpversion-minor release diff --git a/README.md b/README.md index 103fc113..4da013b0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@

-“DAI-Lab”
-An open source project from Data to AI Lab at MIT.
+
+DAI-Lab
+
+An Open Source Project from the Data to AI Lab, at MIT

@@ -13,18 +15,19 @@ Pipelines and Primitives for Machine Learning and Data Science. [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) -[![Travis](https://travis-ci.com/MLBazaar/MLBlocks.svg?branch=master)](https://travis-ci.com/MLBazaar/MLBlocks) +[![Tests](https://github.com/MLBazaar/MLBlocks/workflows/Run%20Tests/badge.svg)](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials)
# MLBlocks -* Free software: [MIT license](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) -* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) * Documentation: https://mlbazaar.github.io/MLBlocks -* Homepage: https://github.com/MLBazaar/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE) +* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) ## Overview @@ -49,11 +52,7 @@ Features include: **MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) -Also, although it is not strictly required, the usage of a -[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid -interfering with other software installed in the system where **MLBlocks** is run. - -## Install with pip +## Install with `pip` The easiest and recommended way to install **MLBlocks** is using [pip]( https://pip.pypa.io/en/stable/): diff --git a/apt.txt b/apt.txt new file mode 100644 index 00000000..65387721 --- /dev/null +++ b/apt.txt @@ -0,0 +1,3 @@ +# apt-get requirements for development and mybinder environment +graphviz +pandoc diff --git a/docs/api/mlblocks.datasets.rst b/docs/api/mlblocks.datasets.rst deleted file mode 100644 index 6661cd8a..00000000 --- a/docs/api/mlblocks.datasets.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.datasets -================= - -.. automodule:: mlblocks.datasets - :members: diff --git a/docs/api/mlblocks.discovery.rst b/docs/api/mlblocks.discovery.rst deleted file mode 100644 index c9109130..00000000 --- a/docs/api/mlblocks.discovery.rst +++ /dev/null @@ -1,5 +0,0 @@ -mlblocks.discovery -================== - -.. automodule:: mlblocks.discovery - :members: diff --git a/docs/index.rst b/docs/index.rst index 85717469..25567005 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,9 +6,9 @@ What is MLBlocks? :alt: MLBlocks :align: center -* Free software: `MIT license `_ * Documentation: https://mlbazaar.github.io/MLBlocks -* Homepage: https://github.com/MLBazaar/MLBlocks +* Github: https://github.com/MLBazaar/MLBlocks +* License: `MIT `_ MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 8cde5340..082d12b6 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -30,7 +30,7 @@ additional information not found inside `X`. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_umls + from mlprimitives.datasets import load_umls dataset = load_umls() dataset.describe() diff --git a/docs/pipeline_examples/image.rst b/docs/pipeline_examples/image.rst index b9b97ef7..e892f915 100644 --- a/docs/pipeline_examples/image.rst +++ b/docs/pipeline_examples/image.rst @@ -24,7 +24,7 @@ Gradients using the corresponding `scikit-image function`_ to later on use a sim .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -61,7 +61,7 @@ and directly after go into a Single Layer CNN Classifier built on Keras using th .. 
code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_usps + from mlprimitives.datasets import load_usps dataset = load_usps() dataset.describe() @@ -107,7 +107,7 @@ to an `XGBRegressor`_ primitive. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_handgeometry + from mlprimitives.datasets import load_handgeometry dataset = load_handgeometry() dataset.describe() diff --git a/docs/pipeline_examples/multi_table.rst b/docs/pipeline_examples/multi_table.rst index c2c2066f..7091a374 100644 --- a/docs/pipeline_examples/multi_table.rst +++ b/docs/pipeline_examples/multi_table.rst @@ -25,7 +25,7 @@ tables are. .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_wikiqa + from mlprimitives.datasets import load_wikiqa dataset = load_wikiqa() dataset.describe() diff --git a/docs/pipeline_examples/single_table.rst b/docs/pipeline_examples/single_table.rst index ee00d9c6..6a031cb1 100644 --- a/docs/pipeline_examples/single_table.rst +++ b/docs/pipeline_examples/single_table.rst @@ -5,7 +5,7 @@ In this section we will go over a few pipeline examples to show **MLBlocks** wor in different scenarios and with different types of data. For each example, we will be using example datasets which can be downloaded using the -various functions found in the ``mlblocks.datasets`` module. +various functions found in the ``mlprimitives.datasets`` module. .. note:: Even though the datasets are not especially big, some of the examples might use a considerable amount of resources, especially memory, and might take @@ -21,7 +21,7 @@ the numeric data from `The Boston Dataset`_, which we will load using the .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_boston + from mlprimitives.datasets import load_boston dataset = load_boston() dataset.describe() @@ -52,7 +52,7 @@ In this case, we will also be passing some initialization parameters for the XGB .. code-block:: python from mlblocks import MLPipeline - from mlblocks.datasets import load_iris + from mlprimitives.datasets import load_iris dataset = load_iris() dataset.describe() diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index ee0c16ac..75ca3f39 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -28,7 +28,7 @@ for later ones. import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_newsgroups + from mlprimitives.datasets import load_newsgroups dataset = load_newsgroups() dataset.describe() @@ -105,7 +105,7 @@ to encode all the string features, and go directly into the import nltk from mlblocks import MLPipeline - from mlblocks.datasets import load_personae + from mlprimitives.datasets import load_personae dataset = load_personae() dataset.describe() diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 829a38d6..44431d4f 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -616,7 +616,7 @@ } ], "source": [ - "session.run(20)" + "session.run(10)" ] }, { diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index e3d6fada..8e4e6537 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -11,8 +11,8 @@ """ from mlblocks.discovery import ( - add_pipelines_path, add_primitives_path, get_pipelines_paths, get_primitives_paths, - load_pipeline, load_primitive) + add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths, + get_primitives_paths, load_pipeline, load_primitive) from mlblocks.mlblock import MLBlock from mlblocks.mlpipeline import MLPipeline @@ -23,6 +23,14 @@ __version__ = '0.4.0.dev1' __all__ = [ - 'MLBlock', 'MLPipeline', 'add_pipelines_path', 'add_primitives_path', - 'get_pipelines_paths', 'get_primitives_paths', 'load_pipeline', 'load_primitive' + 'MLBlock', + 'MLPipeline', + 'add_pipelines_path', + 'add_primitives_path', + 'find_pipelines', + 'find_primitives', + 'get_pipelines_paths', + 'get_primitives_paths', + 'load_pipeline', + 'load_primitive' ] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..3b01f6bf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +# Requirements for development and mybinder environment +-e .[dev] diff --git a/setup.cfg b/setup.cfg index 17998d88..969e1d64 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,6 @@ exclude = .tox, .git, __pycache__, .ipynb_checkpoints ignore = # Keep empty to prevent default ignores [isort] -include_trailing_comment = True line_length = 99 lines_between_types = 0 multi_line_output = 4 diff --git a/setup.py b/setup.py index 0eab74aa..c9068f3a 100644 --- a/setup.py +++ b/setup.py @@ -5,12 +5,10 @@ from setuptools import find_packages, setup - -with open('README.md') as readme_file: +with open('README.md', encoding='utf-8') as readme_file: readme = readme_file.read() - -with open('HISTORY.md') as history_file: +with open('HISTORY.md', encoding='utf-8') as history_file: history = history_file.read() @@ -21,13 +19,12 @@ ] -examples_require = [ - 'matplotlib>=2.2.2,<3.2.2', +mlprimitives_requires = [ 'mlprimitives>=0.3.0.dev0,<0.4', - 'boto3>=1.14,<1.14.45', - 'botocore<1.17.45,>=1.17.44', +] + +examples_require = mlprimitives_requires + [ 'jupyter==1.0.0', - 'docutils<0.16,>=0.10', 'baytune>=0.3.13.dev0,<0.4', ] @@ -79,9 +76,6 @@ # Documentation style 'doc8>=0.8.0', 'pydocstyle>=3.0.0', - - # Prevent travis-ci conflict - 'chardet<4', ] @@ -98,16 +92,17 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - description="Pipelines and primitives for machine learning and data science.", + description='Pipelines and primitives for machine learning and data science.', extras_require={ 'dev': development_requires + tests_require + examples_require, 'test': tests_require + examples_require, 'examples': examples_require, + 'mlprimitives': mlprimitives_requires, }, include_package_data=True, install_requires=install_requires, keywords='auto machine learning classification regression data science pipeline', - license="MIT license", + license='MIT license', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', name='mlblocks', diff --git a/tox.ini b/tox.ini index 1bc3f81a..229c1d54 100644 --- a/tox.ini +++ b/tox.ini @@ -14,8 +14,10 @@ skip_install = false extras = test commands = /usr/bin/env make test + rm -r {envdir} [testenv:test-devel] extras = dev commands = /usr/bin/env make test-devel + rm 
-r {envdir} From 6dbcda49319047b1dcf339f5c00b830b61a8ed29 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:19:37 +0100 Subject: [PATCH 129/160] Fix dependency conflicts --- requirements.txt | 1 + setup.py | 6 ++++-- tox.ini | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3b01f6bf..d2ce3888 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ # Requirements for development and mybinder environment -e .[dev] +docutils<0.16,>=0.10 # Fix dependency conflict on mybinder diff --git a/setup.py b/setup.py index c9068f3a..91edced6 100644 --- a/setup.py +++ b/setup.py @@ -20,12 +20,14 @@ mlprimitives_requires = [ - 'mlprimitives>=0.3.0.dev0,<0.4', + 'mlprimitives>=0.3.0,<0.4', + 'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict + 'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3 ] examples_require = mlprimitives_requires + [ 'jupyter==1.0.0', - 'baytune>=0.3.13.dev0,<0.4', + 'baytune>=0.4.0,<0.5', ] diff --git a/tox.ini b/tox.ini index 229c1d54..e38f071b 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ python = [testenv] passenv = CI TRAVIS TRAVIS_* +allowlist_externals = rm skipsdist = false skip_install = false extras = test From 2c1e9a3f83bcb937a630b440dbbcef83db4eff4d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:20:33 +0100 Subject: [PATCH 130/160] Add release notes for v0.4.0 --- HISTORY.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 17bbda92..da082c25 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,17 @@ Changelog ========= +0.4.0 - 2021-01-09 +------------------ + +* Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish +* Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer +* Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala +* Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish +* Allow loading a primitive or pipeline directly from the JSON path - [Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala +* Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu +* Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu + 0.3.4 - 2019-11-01 ------------------ From 04bb5fc72f55a9e2f439bed2d4ec3ae6537f52f4 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:22:42 +0100 Subject: [PATCH 131/160] =?UTF-8?q?Bump=20version:=200.4.0.dev1=20?= =?UTF-8?q?=E2=86=92=200.4.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 8e4e6537..28a80c5d 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0.dev1' +__version__ = '0.4.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 969e1d64..dc027074 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0.dev1 +current_version = 0.4.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
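Stepping back to the early-stopping change from #132 above: once the last block that implements ``fit`` has been fitted, ``MLPipeline.fit`` no longer runs the remaining ``produce`` steps unless an output was requested. A minimal sketch of the resulting behavior, with illustrative primitive names and placeholder data (``X_train`` and ``y_train`` are assumed to already exist):

.. code-block:: python

    from mlblocks import MLPipeline

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.ensemble.RandomForestClassifier',
    ])

    # The scaler is fitted and then produced, because its output still feeds
    # the classifier. The classifier is the pipeline's last fit block, so
    # after it is fitted nothing is pending and its produce step is skipped.
    pipeline.fit(X_train, y_train)

    # Requesting an output keeps the produce steps that are needed to build it:
    y_pred = pipeline.fit(X_train, y_train, output_='default')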
diff --git a/setup.py b/setup.py index 91edced6..0c05c20b 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0.dev1', + version='0.4.0', zip_safe=False, ) From ae9653bfd0ae3e9798071d8bec311cee4e396804 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sat, 9 Jan 2021 17:22:55 +0100 Subject: [PATCH 132/160] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200?= =?UTF-8?q?.4.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 28a80c5d..61438750 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0' +__version__ = '0.4.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index dc027074..96b72ce1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 0c05c20b..db8f5aa6 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.0', + version='0.4.1.dev0', zip_safe=False, ) From 098302e83d17d05425bf546077805738abeaebc7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Mon, 8 Mar 2021 16:27:05 +0100 Subject: [PATCH 133/160] Implement dynamic inputs and outputs. (#135) * Implement dynamic inputs and outputs. * Recover block_outputs if it's a string from the block's instance. * Update tests --- mlblocks/mlblock.py | 13 +++- mlblocks/mlpipeline.py | 16 +++++ tests/test_mlpipeline.py | 142 ++++++++++++++++++++++++++++++--------- 3 files changed, 136 insertions(+), 35 deletions(-) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index f570165b..d2295722 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -111,8 +111,15 @@ def _extract_params(self, kwargs, hyperparameters): if name in kwargs: init_params[name] = kwargs.pop(name) - fit_args = [arg['name'] for arg in self.fit_args] - produce_args = [arg['name'] for arg in self.produce_args] + if not isinstance(self.fit_args, str): + fit_args = [arg['name'] for arg in self.fit_args] + else: + fit_args = [] + + if not isinstance(self.produce_args, str): + produce_args = [arg['name'] for arg in self.produce_args] + else: + produce_args = [] for name in list(kwargs.keys()): if name in fit_args: @@ -257,6 +264,8 @@ def _get_method_kwargs(self, kwargs, method_args): A dictionary containing the argument names and values to pass to the primitive method. 
""" + if isinstance(method_args, str): + method_args = getattr(self.instance, method_args)() method_kwargs = dict() for arg in method_args: diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index d7935757..738b13b0 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -177,6 +177,9 @@ def _get_block_variables(self, block_name, variables_attr, names): """ block = self.blocks[block_name] variables = deepcopy(getattr(block, variables_attr)) + if isinstance(variables, str): + variables = getattr(block.instance, variables)() + variable_dict = {} for variable in variables: name = variable['name'] @@ -300,6 +303,12 @@ def get_inputs(self, fit=True): return inputs + def get_fit_args(self): + return list(self.get_inputs(fit=True).values()) + + def get_predict_args(self): + return list(self.get_inputs(fit=False).values()) + def get_outputs(self, outputs='default'): """Get the list of output variables that correspond to the specified outputs. @@ -578,6 +587,10 @@ def _get_block_args(self, block_name, block_args, context): input_names = self.input_names.get(block_name, dict()) + if isinstance(block_args, str): + block = self.blocks[block_name] + block_args = getattr(block.instance, block_args)() + kwargs = dict() for arg in block_args: name = arg['name'] @@ -591,6 +604,9 @@ def _get_block_args(self, block_name, block_args, context): def _extract_outputs(self, block_name, outputs, block_outputs): """Extract the outputs of the method as a dict to be set into the context.""" # TODO: type validation and/or transformation should be done here + if isinstance(block_outputs, str): + block = self.blocks[block_name] + block_outputs = getattr(block.instance, block_outputs)() if not isinstance(outputs, tuple): outputs = (outputs, ) diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index 0ee4cf2c..be8c6f6b 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -381,6 +381,7 @@ def test_get_outputs_str_named(self): ] } pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) + returned = pipeline.get_outputs('debug') expected = [ @@ -389,13 +390,11 @@ def test_get_outputs_str_named(self): 'variable': 'another_variable', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_outputs_str_variable(self): pipeline = MLPipeline(['a_primitive', 'another_primitive']) - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -412,7 +411,6 @@ def test_get_outputs_str_variable(self): 'variable': 'a_primitive#1.output' } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -427,7 +425,6 @@ def test_get_outputs_str_block(self): 'variable': 'a_primitive#1', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -442,7 +439,6 @@ def test_get_outputs_int(self): 'variable': 'another_primitive#1', } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -463,7 +459,6 @@ def test_get_outputs_combination(self): ] } pipeline = MLPipeline(['a_primitive', 'another_primitive'], outputs=outputs) - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -498,7 +493,6 @@ def test_get_outputs_combination(self): 'variable': 'a_primitive#1.output' } ] - assert returned == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) @@ -550,21 +544,39 @@ def test_get_output_variables(self): assert names == ['a_variable'] 
@patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test__get_block_variables(self): + def test__get_block_variables_is_dict(self): + pipeline = MLPipeline(['a_primitive']) + pipeline.blocks['a_primitive#1'].produce_outputs = [ + { + 'name': 'output', + 'type': 'whatever' + } + ] + + outputs = pipeline._get_block_variables( + 'a_primitive#1', + 'produce_outputs', + {'output': 'name_output'} + ) + expected = { 'name_output': { 'name': 'output', 'type': 'whatever', } } + assert outputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test__get_block_variables_is_str(self): pipeline = MLPipeline(['a_primitive']) - - pipeline.blocks['a_primitive#1'].produce_outputs = [ + pipeline.blocks['a_primitive#1'].produce_outputs = 'get_produce_outputs' + pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.return_value = [ { - 'name': 'output', - 'type': 'whatever' + 'name': 'output_from_function', + 'type': 'test' } + ] outputs = pipeline._get_block_variables( @@ -572,10 +584,50 @@ def test__get_block_variables(self): 'produce_outputs', {'output': 'name_output'} ) + + expected = { + 'output_from_function': { + 'name': 'output_from_function', + 'type': 'test', + } + } assert outputs == expected + pipeline.blocks['a_primitive#1'].instance.get_produce_outputs.assert_called_once_with() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_inputs_fit(self): + pipeline = MLPipeline(['a_primitive', 'another_primitive']) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + pipeline.blocks['another_primitive#1'].produce_args = [ + { + 'name': 'output', + 'type': 'another_whatever' + }, + { + 'name': 'another_input', + 'type': 'another_whatever' + } + ] + + inputs = pipeline.get_inputs() + expected = { 'input': { 'name': 'input', @@ -589,32 +641,30 @@ def test_get_inputs_fit(self): 'name': 'another_input', 'type': 'another_whatever', } - } + assert inputs == expected + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_inputs_no_fit(self): pipeline = MLPipeline(['a_primitive', 'another_primitive']) - pipeline.blocks['a_primitive#1'].produce_args = [ { 'name': 'input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].fit_args = [ { 'name': 'fit_input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', 'type': 'another_whatever' } ] - pipeline.blocks['another_primitive#1'].produce_args = [ { 'name': 'output', @@ -626,11 +676,8 @@ def test_get_inputs_fit(self): } ] - inputs = pipeline.get_inputs() - assert inputs == expected + inputs = pipeline.get_inputs(fit=False) - @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) - def test_get_inputs_no_fit(self): expected = { 'input': { 'name': 'input', @@ -640,25 +687,24 @@ def test_get_inputs_no_fit(self): 'name': 'another_input', 'type': 'another_whatever', } - } + assert inputs == expected - pipeline = MLPipeline(['a_primitive', 'another_primitive']) - + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_fit_args(self): + pipeline = MLPipeline(['a_primitive']) pipeline.blocks['a_primitive#1'].produce_args = [ { 'name': 'input', 'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].fit_args = [ { 'name': 'fit_input', 
'type': 'whatever' } ] - pipeline.blocks['a_primitive#1'].produce_output = [ { 'name': 'output', @@ -666,20 +712,50 @@ def test_get_inputs_no_fit(self): } ] - pipeline.blocks['another_primitive#1'].produce_args = [ + outputs = pipeline.get_fit_args() + + expected = [ { - 'name': 'output', - 'type': 'another_whatever' + 'name': 'input', + 'type': 'whatever' }, { - 'name': 'another_input', - 'type': 'another_whatever' + 'name': 'fit_input', + 'type': 'whatever', } ] + assert outputs == expected - inputs = pipeline.get_inputs(fit=False) + @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) + def test_get_predict_args(self): + pipeline = MLPipeline(['a_primitive']) + pipeline.blocks['a_primitive#1'].produce_args = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].fit_args = [ + { + 'name': 'fit_input', + 'type': 'whatever' + } + ] + pipeline.blocks['a_primitive#1'].produce_output = [ + { + 'name': 'output', + 'type': 'another_whatever' + } + ] + outputs = pipeline.get_predict_args() - assert inputs == expected + expected = [ + { + 'name': 'input', + 'type': 'whatever' + } + ] + assert outputs == expected @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_fit_pending_all_primitives(self): From 286e0f207d569eff4d2b1a52aeb128965a5372a7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 8 Mar 2021 18:08:19 +0100 Subject: [PATCH 134/160] =?UTF-8?q?Bump=20version:=200.4.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.4.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 61438750..5e8d665e 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev0' +__version__ = '0.4.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 96b72ce1..e75ffe48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev0 +current_version = 0.4.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
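For reference, the two convenience methods added in the dynamic inputs patch above (#135) simply flatten ``get_inputs`` into lists. A sketch of how they might be used, assuming ``mlprimitives`` provides the annotation (the exact variable names depend on the primitive's JSON):

.. code-block:: python

    from mlblocks import MLPipeline

    pipeline = MLPipeline(['sklearn.ensemble.RandomForestClassifier'])

    pipeline.get_fit_args()      # e.g. [{'name': 'X', ...}, {'name': 'y', ...}]
    pipeline.get_predict_args()  # e.g. [{'name': 'X', ...}]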
diff --git a/setup.py b/setup.py index db8f5aa6..a48b031f 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev0', + version='0.4.1.dev1', zip_safe=False, ) From ae1cdd66a10bb0e6341ab716e1fdb7ca7fc51bae Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 27 Sep 2021 10:38:04 -0400 Subject: [PATCH 135/160] Update dependencies (#136) * Increase numpy cap --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a48b031f..78f4053a 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ install_requires = [ 'graphviz>=0.9,<1', - 'numpy>=1.17.1,<1.19', + 'numpy>=1.17.1,<1.21', 'psutil>=5,<6', ] From 3585628764bcb0bb2e06348eed4a90da5df3d4df Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 10:55:56 -0400 Subject: [PATCH 136/160] =?UTF-8?q?Bump=20version:=200.4.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.4.1.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 5e8d665e..f3ead991 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev1' +__version__ = '0.4.1.dev2' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index e75ffe48..b106c1e6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev1 +current_version = 0.4.1.dev2 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 78f4053a..6a193b32 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev1', + version='0.4.1.dev2', zip_safe=False, ) From e8d353da3bf2585d4cbed40f07dda93529690196 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:06:57 -0400 Subject: [PATCH 137/160] prepare release notes --- HISTORY.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index da082c25..0575c034 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,12 @@ Changelog ========= +0.4.1 - 2021-10-08 +------------------ + +* Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish +* Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer + 0.4.0 - 2021-01-09 ------------------ From 16ba53c557a770760bb46fbf17566891a258cdb3 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:07:26 -0400 Subject: [PATCH 138/160] =?UTF-8?q?Bump=20version:=200.4.1.dev2=20?= =?UTF-8?q?=E2=86=92=200.4.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 5e8d665e..9c9d5d13 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1.dev2' +__version__ = '0.4.1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index b106c1e6..84f59fab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1.dev2 +current_version = 0.4.1 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? diff --git a/setup.py b/setup.py index 6a193b32..b7c717be 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1.dev2', + version='0.4.1', zip_safe=False, ) From 515d0a7af4e6466014333eace818d3a64a2ce46b Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 8 Oct 2021 12:07:41 -0400 Subject: [PATCH 139/160] =?UTF-8?q?Bump=20version:=200.4.1=20=E2=86=92=200?= =?UTF-8?q?.4.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 9c9d5d13..9c42ed1a 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.1' +__version__ = '0.4.2.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 84f59fab..fc9e4e12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1 +current_version = 0.4.2.dev0 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
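The `parse` expression that these `setup.cfg` hunks keep as context is what lets bumpversion split versions such as `0.4.2.dev0` into components. The sketch below shows it as a plain Python regex; the group names follow the standard bumpversion convention, which is an assumption about this project's config.

```python
# Sketch of the bumpversion `parse` expression from setup.cfg as a Python
# regex; the group names follow the usual bumpversion convention (assumed).
import re

PARSE = re.compile(
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

print(PARSE.match('0.4.2.dev0').groupdict())
# {'major': '0', 'minor': '4', 'patch': '2', 'release': 'dev', 'candidate': '0'}
```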
diff --git a/setup.py b/setup.py index b7c717be..c0432aa4 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.1', + version='0.4.2.dev0', zip_safe=False, ) From 79fc8fbc4632f164102c4973badd13cd38c31e84 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 11 Dec 2022 16:12:08 -0600 Subject: [PATCH 140/160] Update `numpy` dependency (#139) * push numpy cap * add separate tests for mlblocks * fix command * create new unit test environment * pin jinja2 * pin markupsafe * add docutils * pin scikit learn for docs * unpin scikit-learn and add okwarning --- .github/workflows/tests.yml | 35 +++++++++++++++++++++-------- Makefile | 6 ++++- docs/getting_started/quickstart.rst | 2 ++ setup.py | 8 +++++-- tests/test_mlpipeline.py | 6 ++--- 5 files changed, 42 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ea2c37f5..4cb525ed 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package @@ -29,11 +29,11 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies @@ -46,31 +46,48 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install package and dependencies - run: pip install .[test] + run: pip install .[unit] - name: make test-unit run: make test-unit + unit-mlprimitives: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-20.04, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install package and dependencies + run: pip install .[test] + - name: make test-mlprimitives + run: make test-mlprimitives + tutorials: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest] + os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - if: matrix.os == 'ubuntu-latest' + - if: matrix.os == 'ubuntu-20.04' name: Install dependencies - Ubuntu run: sudo apt-get install graphviz - name: Install package and dependencies diff --git a/Makefile b/Makefile index c28da455..2ae6c7c3 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,10 @@ lint-docs: ## check docs formatting with doc8 and pydocstyle .PHONY: test-unit test-unit: ## run tests quickly with the default Python + python -m pytest --cov=mlblocks --ignore=tests/features/ + +.PHONY: 
test-mlprimitives +test-mlprimitives: ## run tests quickly with the default Python python -m pytest --cov=mlblocks .PHONY: test-readme @@ -132,7 +136,7 @@ test-tutorials: ## run the tutorial notebooks jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + .PHONY: test -test: test-unit test-readme ## test everything that needs test dependencies +test: test-unit test-mlprimitives test-readme ## test everything that needs test dependencies .PHONY: check-dependencies check-dependencies: ## test if there are any broken dependencies diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 386752dc..f0cb9a3f 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -102,6 +102,7 @@ To do this, we first call the ``fit`` method passing the training data and the c labels. .. ipython:: python + :okwarning: from mlprimitives.datasets import load_census dataset = load_census() @@ -112,6 +113,7 @@ Once we have fitted our model to our data, we can call the ``predict`` method pa to obtain predictions from the pipeline. .. ipython:: python + :okwarning: predictions = pipeline.predict(X_test) predictions diff --git a/setup.py b/setup.py index c0432aa4..85b05bcd 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ install_requires = [ 'graphviz>=0.9,<1', - 'numpy>=1.17.1,<1.21', + 'numpy>=1.17.1,<2', 'psutil>=5,<6', ] @@ -23,6 +23,7 @@ 'mlprimitives>=0.3.0,<0.4', 'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict 'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3 + 'protobuf<4', # <- importlib ] examples_require = mlprimitives_requires + [ @@ -34,7 +35,6 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', - 'mlprimitives>=0.3.0.dev0,<0.4', 'setuptools>=41.0.0', 'rundoc>=0.4.3', 'prompt-toolkit>=2.0,<3.0', @@ -56,8 +56,11 @@ 'm2r>=0.2.0,<0.3', 'Sphinx>=1.7.1,<3', 'sphinx_rtd_theme>=0.2.4,<0.5', + 'docutils>=0.12,<0.18', 'ipython>=6.5.0', 'autodocsumm>=0.1.10', + 'Jinja2>=2,<3', # >=3 makes sphinx theme fail + 'markupsafe<2.1.0', # style check 'flake8>=3.7.7,<4', @@ -97,6 +100,7 @@ description='Pipelines and primitives for machine learning and data science.', extras_require={ 'dev': development_requires + tests_require + examples_require, + 'unit': tests_require, 'test': tests_require + examples_require, 'examples': examples_require, 'mlprimitives': mlprimitives_requires, diff --git a/tests/test_mlpipeline.py b/tests/test_mlpipeline.py index be8c6f6b..084eac3d 100644 --- a/tests/test_mlpipeline.py +++ b/tests/test_mlpipeline.py @@ -1124,7 +1124,7 @@ def test_get_diagram_simple(self): ] pipeline.blocks['a_primitive#1'].produce_output = output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_fit(self): @@ -1155,7 +1155,7 @@ def test_get_diagram_fit(self): ] pipeline.blocks['a_primitive#1'].produce_output = output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() @patch('mlblocks.mlpipeline.MLBlock', new=get_mlblock_mock) def test_get_diagram_multiple_blocks(self): @@ -1189,7 +1189,7 @@ def test_get_diagram_multiple_blocks(self): pipeline.blocks['b_primitive#1'].produce_args = first_output pipeline.blocks['b_primitive#1'].produce_output = second_output - assert str(pipeline.get_diagram()) == expected + assert str(pipeline.get_diagram()).strip() == expected.strip() def test_fit(self): 
pass From 40c5c413dc62cd1e38b6fa8e40fc858b6ac54479 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:25:44 -0500 Subject: [PATCH 141/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 0575c034..c183b575 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.5.0 - 2023-01-22 +------------------ + +* Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish + 0.4.1 - 2021-10-08 ------------------ From a4ba9c4e588d88b95797117e2562100bb76e6def Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:03 -0500 Subject: [PATCH 142/160] =?UTF-8?q?Bump=20version:=200.4.2.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 9c42ed1a..82a61ca3 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.2.dev0' +__version__ = '0.5.0.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index fc9e4e12..d21c7a1a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.2.dev0 +current_version = 0.5.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 85b05bcd..4926b10a 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.4.2.dev0', + version='0.5.0.dev0', zip_safe=False, ) From 8140e3dcfe017e2a1e04ada9c6783f2dcdf30198 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:03 -0500 Subject: [PATCH 143/160] =?UTF-8?q?Bump=20version:=200.5.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 82a61ca3..7cc2da30 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.0.dev0' +__version__ = '0.5.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index d21c7a1a..746b4d2f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0.dev0 +current_version = 0.5.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
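The Makefile split introduced in PATCH 140 above separates the pure-mlblocks unit run from the mlprimitives-dependent one. What `make test-unit` executes can be expressed through pytest's Python entry point, as a rough sketch; it assumes it is run from the repository root with `pytest-cov` installed:

```python
# Minimal sketch of the new `make test-unit` target from PATCH 140, via
# pytest's Python entry point; the paths mirror the Makefile, nothing else.
import sys

import pytest

# --ignore keeps the mlprimitives-dependent feature tests out of the unit
# run; `make test-mlprimitives` runs the full suite without the exclusion.
sys.exit(pytest.main(['--cov=mlblocks', '--ignore=tests/features/']))
```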
diff --git a/setup.py b/setup.py index 4926b10a..8b11e6ff 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.0.dev0', + version='0.5.0', zip_safe=False, ) From a70b30713416ca1bc1a4cf2c2675cda383e28ca8 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 22 Jan 2023 16:26:18 -0500 Subject: [PATCH 144/160] =?UTF-8?q?Bump=20version:=200.5.0=20=E2=86=92=200?= =?UTF-8?q?.5.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 7cc2da30..3e7aa671 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.0' +__version__ = '0.5.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 746b4d2f..70204a8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0 +current_version = 0.5.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 8b11e6ff..70d599ea 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.0', + version='0.5.1.dev0', zip_safe=False, ) From c74137e6a52c141d2bc10bb8b11de5b72e83ea07 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:17:52 -0400 Subject: [PATCH 145/160] Upgrade python (#142) * add python 3.9 and 3.10 * fix python specification * update python version in readme --- .github/workflows/tests.yml | 2 +- Makefile | 4 ++++ README.md | 2 +- setup.py | 4 +++- tox.ini | 4 +++- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4cb525ed..3f46f728 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -45,7 +45,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 diff --git a/Makefile b/Makefile index 2ae6c7c3..4fa8cc04 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,10 @@ install: clean-build clean-pyc ## install the package to the active Python's sit install-examples: clean-build clean-pyc ## install the package and the examples dependencies pip install .[examples] +.PHONY: install-unit +install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests + pip install .[unit] + .PHONY: install-test install-test: clean-build clean-pyc ## install the package and test dependencies pip install .[test] diff --git a/README.md b/README.md index 4da013b0..13c23c3a 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.6, 3.7, 3.8, 3.9, and 3.10](https://www.python.org/downloads/) ## Install with `pip` diff --git a/setup.py b/setup.py index 70d599ea..17159dbb 100644 --- a/setup.py +++ b/setup.py @@ -96,6 +96,8 @@ 
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', ], description='Pipelines and primitives for machine learning and data science.', extras_require={ @@ -113,7 +115,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), - python_requires='>=3.6,<3.9', + python_requires='>=3.6,<3.11', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index e38f071b..a589526a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,10 @@ [tox] -envlist = py3{6,7,8}, test-devel +envlist = py3{6,7,8,9,10}, test-devel [travis] python = + 3.10: py10 + 3.9: py39 3.8: py38, test-devel 3.7: py37 3.6: py36 From b85983d956699c5863e153816543fb6f29bdb8ff Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 14:28:08 -0400 Subject: [PATCH 146/160] =?UTF-8?q?Bump=20version:=200.5.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.5.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 3e7aa671..3b880bb8 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.1.dev0' +__version__ = '0.5.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 70204a8c..40f0d06a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1.dev0 +current_version = 0.5.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
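PATCH 145 widens both the trove classifiers and `python_requires` to `>=3.6,<3.11`. A hedged fail-fast guard mirroring that bound, purely illustrative and not part of the actual package, could look like this:

```python
# Sketch mirroring the widened `python_requires='>=3.6,<3.11'` bound from
# PATCH 145; an illustrative guard, not code that ships with mlblocks.
import sys

if not ((3, 6) <= sys.version_info[:2] < (3, 11)):
    raise RuntimeError('this MLBlocks release supports Python >=3.6,<3.11')
```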
diff --git a/setup.py b/setup.py index 17159dbb..9ab20327 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.1.dev0', + version='0.5.1.dev1', zip_safe=False, ) From 6597bfa501bc341e27f48e2ca357a9b61a17a854 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:18 -0400 Subject: [PATCH 147/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index c183b575..f1c4209f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.6.0 - 2023-04-14 +------------------ + +* Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish + 0.5.0 - 2023-01-22 ------------------ From 1cc2551142cc21165a09f52063545b3edd02fed7 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:31 -0400 Subject: [PATCH 148/160] =?UTF-8?q?Bump=20version:=200.5.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.6.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 3b880bb8..344fd4b2 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.5.1.dev1' +__version__ = '0.6.0.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 40f0d06a..4637a833 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1.dev1 +current_version = 0.6.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 9ab20327..80137119 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.5.1.dev1', + version='0.6.0.dev0', zip_safe=False, ) From f934db0d36f4d4965707092209fcafdba74dc330 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:31 -0400 Subject: [PATCH 149/160] =?UTF-8?q?Bump=20version:=200.6.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 344fd4b2..650b26ca 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.0.dev0' +__version__ = '0.6.0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 4637a833..2800a7f1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0.dev0 +current_version = 0.6.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 80137119..fd8791a8 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.0.dev0', + version='0.6.0', zip_safe=False, ) From ec8433590f8e928484f49ea0a76543caf7f117b5 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Fri, 14 Apr 2023 15:21:51 -0400 Subject: [PATCH 150/160] =?UTF-8?q?Bump=20version:=200.6.0=20=E2=86=92=200?= =?UTF-8?q?.6.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 650b26ca..021d9734 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.0' +__version__ = '0.6.1.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 2800a7f1..40e7b099 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0 +current_version = 0.6.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index fd8791a8..c9658a63 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.0', + version='0.6.1.dev0', zip_safe=False, ) From 21f0df503609fe256ca9711b98fd92f4b83a522e Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:43:52 -0400 Subject: [PATCH 151/160] Add python 3.11 to MLBlocks (#143) * test python 3.11 * pin lightfm * update pip * fix syntax * add wheel * fix data loading * fix readme example * remove data --- .github/workflows/tests.yml | 18 ++++++- README.md | 12 +++-- docs/getting_started/quickstart.rst | 14 +++-- .../tutorials/1. Using and MLPipeline.ipynb | 23 +++++--- .... Setting MLPipeline Hyperparameters.ipynb | 24 +++++++-- .../4. Saving and Loading a Pipeline.ipynb | 19 +++++-- ...ial execution and pipeline debugging.ipynb | 19 +++++-- .../6. Flexible outputs specification.ipynb | 30 ++++++++--- examples/tutorials/7. 
Tuning a Pipeline.ipynb | 4 +- ...or the best pipeline with BTBSession.ipynb | 20 +++---- examples/tutorials/utils.py | 52 +++++++++++++++++++ setup.py | 3 +- tox.ini | 3 +- 13 files changed, 187 insertions(+), 54 deletions(-) create mode 100644 examples/tutorials/utils.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3f46f728..0eb00220 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,6 +19,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package run: pip install .[dev] - name: make test-devel @@ -36,6 +40,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install rundoc .[mlprimitives] - name: make test-readme @@ -45,7 +53,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -70,6 +78,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install .[test] - name: make test-mlprimitives @@ -90,6 +102,10 @@ jobs: - if: matrix.os == 'ubuntu-20.04' name: Install dependencies - Ubuntu run: sudo apt-get install graphviz + - name: Upgrade pip + run: pip install -U pip setuptools wheel + - name: Install lightfm + run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package and dependencies run: pip install .[examples] - name: make test-tutorials diff --git a/README.md b/README.md index 13c23c3a..662a3ed3 100644 --- a/README.md +++ b/README.md @@ -86,11 +86,15 @@ pipeline which combines primitives from [MLPrimitives](https://github.com/MLBaza [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). ```python3 +import pandas as pd from mlblocks import MLPipeline -from mlprimitives.datasets import load_dataset +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score -dataset = load_dataset('census') -X_train, X_test, y_train, y_test = dataset.get_splits(1) +dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') +label = dataset.pop('label') + +X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) primitives = [ 'mlprimitives.custom.preprocessing.ClassEncoder', @@ -104,7 +108,7 @@ pipeline = MLPipeline(primitives) pipeline.fit(X_train, y_train) predictions = pipeline.predict(X_test) -dataset.score(y_test, predictions) +accuracy_score(y_test, predictions) ``` # What's Next? diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index f0cb9a3f..55c20d86 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -104,9 +104,13 @@ labels. .. 
ipython:: python :okwarning: - from mlprimitives.datasets import load_census - dataset = load_census() - X_train, X_test, y_train, y_test = dataset.get_splits(1) + import pandas as pd + from sklearn.model_selection import train_test_split + + dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') + label = dataset.pop('label') + + X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) pipeline.fit(X_train, y_train) Once we have fitted our model to our data, we can call the ``predict`` method passing new data @@ -115,9 +119,11 @@ to obtain predictions from the pipeline. .. ipython:: python :okwarning: + from sklearn.metrics import accuracy_score + predictions = pipeline.predict(X_test) predictions - dataset.score(y_test, predictions) + accuracy_score(y_test, predictions) .. _you have already installed them: install.html#additional-dependencies .. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline diff --git a/examples/tutorials/1. Using and MLPipeline.ipynb b/examples/tutorials/1. Using and MLPipeline.ipynb index dab130ea..901cc50b 100644 --- a/examples/tutorials/1. Using and MLPipeline.ipynb +++ b/examples/tutorials/1. Using and MLPipeline.ipynb @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -528,7 +528,16 @@ "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -546,9 +555,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "predictions = pipeline.predict(X_test)" @@ -611,7 +618,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -625,7 +632,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb index 4993fd4e..7aa0ab2b 100644 --- a/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb +++ b/examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')\n", + "dataset = load_census()\n", "X_train, X_test, y_train, y_test = dataset.get_splits(1)" ] }, @@ -268,6 +268,14 @@ "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -394,6 +402,14 @@ "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -415,7 +431,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -429,7 +445,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb index 01a58cd5..ec1c6f97 100644 --- a/examples/tutorials/4. Saving and Loading a Pipeline.ipynb +++ b/examples/tutorials/4. Saving and Loading a Pipeline.ipynb @@ -35,9 +35,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -71,7 +71,16 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -166,7 +175,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -180,7 +189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb index 57b2b43c..769a69c1 100644 --- a/examples/tutorials/5. Partial execution and pipeline debugging.ipynb +++ b/examples/tutorials/5. Partial execution and pipeline debugging.ipynb @@ -36,9 +36,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -430,7 +430,16 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "fit_context = pipeline.fit(start_=1, output_=2, **fit_context)" ] @@ -690,7 +699,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -704,7 +713,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/6. Flexible outputs specification.ipynb b/examples/tutorials/6. Flexible outputs specification.ipynb index ca1048dd..6ecad5a5 100644 --- a/examples/tutorials/6. Flexible outputs specification.ipynb +++ b/examples/tutorials/6. Flexible outputs specification.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -420,7 +420,16 @@ "cell_type": "code", "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "output_spec = [\n", " 'sklearn.impute.SimpleImputer#1.X',\n", @@ -441,7 +450,16 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "output_spec = [\n", " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1.X',\n", @@ -495,7 +513,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -509,7 +527,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index ca30df17..7a288a46 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. Tuning a Pipeline.ipynb @@ -34,9 +34,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 44431d4f..80ad93fb 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -37,9 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mlprimitives.datasets import load_dataset\n", + "from utils import load_census\n", "\n", - "dataset = load_dataset('census')" + "dataset = load_census()" ] }, { @@ -309,9 +309,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -536,9 +534,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -707,9 +703,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -772,7 +766,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -786,7 +780,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/tutorials/utils.py b/examples/tutorials/utils.py new file mode 100644 index 00000000..32b210a7 --- /dev/null +++ b/examples/tutorials/utils.py @@ -0,0 +1,52 @@ +import io +import os + +import pandas as pd +from sklearn.metrics import accuracy_score +from mlprimitives.datasets import Dataset + +DATA_PATH = os.path.join( + os.path.dirname(__file__), + 'data' +) + +DATA_URL = 'http://mlblocks.s3.amazonaws.com/{}.csv' + +def _download(dataset_name, dataset_path): + url = DATA_URL.format(dataset_name) + + data = pd.read_csv(url) + data.to_csv(dataset_path, index=False) + +def _load(dataset_name): + if not os.path.exists(DATA_PATH): + os.makedirs(DATA_PATH) + + dataset_path = os.path.join(DATA_PATH, dataset_name + '.csv') + if not os.path.exists(dataset_path): + _download(dataset_name, dataset_path) + + return dataset_path + +def load_census(): + """Adult Census dataset. + + Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset. + + Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean + records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && + (AFNLWGT>1)&& (HRSWK>0)) + + Prediction task is to determine whether a person makes over 50K a year. 
+ + source: "UCI + sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income" + """ + + dataset_path = _load('census_train') + + X = pd.read_csv(dataset_path) + y = X.pop('label').values + + return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table', + 'classification', 'binary', stratify=True) \ No newline at end of file diff --git a/setup.py b/setup.py index c9658a63..3df32765 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], description='Pipelines and primitives for machine learning and data science.', extras_require={ @@ -115,7 +116,7 @@ long_description_content_type='text/markdown', name='mlblocks', packages=find_packages(include=['mlblocks', 'mlblocks.*']), - python_requires='>=3.6,<3.11', + python_requires='>=3.6,<3.12', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index a589526a..27e499ed 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,9 @@ [tox] -envlist = py3{6,7,8,9,10}, test-devel +envlist = py3{6,7,8,9,10,11}, test-devel [travis] python = + 3.11: py11 3.10: py10 3.9: py39 3.8: py38, test-devel From d401d1026dec4c60a4daed19d97daee58f5b573c Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 11:25:02 -0400 Subject: [PATCH 152/160] =?UTF-8?q?Bump=20version:=200.6.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.6.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 021d9734..86777d40 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1.dev0' +__version__ = '0.6.1.dev1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 40e7b099..33532996 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1.dev0 +current_version = 0.6.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 3df32765..3575b6d0 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1.dev0', + version='0.6.1.dev1', zip_safe=False, ) From 76a0b5767006aad76ccf8761c3c4d6f3bf0c642a Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:41:41 -0400 Subject: [PATCH 153/160] add release notes --- HISTORY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index f1c4209f..1fcf520f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ Changelog ========= +0.6.1 - 2023-09-26 +------------------ + +* Add python 3.11 to MLBlocks - [Issue #143](https://github.com/MLBazaar/MLBlocks/issues/143) by @sarahmish + 0.6.0 - 2023-04-14 ------------------ From 4d8c9d5742f4b3901eb4d49aa8c6b66756ccc6a4 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:41:51 -0400 Subject: [PATCH 154/160] =?UTF-8?q?Bump=20version:=200.6.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.6.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 86777d40..4646fd8b 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1.dev1' +__version__ = '0.6.1' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index 33532996..e02d1a91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1.dev1 +current_version = 0.6.1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 3575b6d0..4b211e2b 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1.dev1', + version='0.6.1', zip_safe=False, ) From 1658ee0552e678e6b6c04c394e22d8e60a8e7112 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish Date: Tue, 26 Sep 2023 13:42:11 -0400 Subject: [PATCH 155/160] =?UTF-8?q?Bump=20version:=200.6.1=20=E2=86=92=200?= =?UTF-8?q?.6.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mlblocks/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py index 4646fd8b..f42e9f83 100644 --- a/mlblocks/__init__.py +++ b/mlblocks/__init__.py @@ -20,7 +20,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.6.1' +__version__ = '0.6.2.dev0' __all__ = [ 'MLBlock', diff --git a/setup.cfg b/setup.cfg index e02d1a91..d582e738 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1 +current_version = 0.6.2.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 4b211e2b..c741eadc 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/MLBazaar/MLBlocks', - version='0.6.1', + version='0.6.2.dev0', zip_safe=False, ) From cf3bd258842864b9f8996dd7e1e0e735d635eb5c Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 17 Nov 2024 12:03:59 -0500 Subject: [PATCH 156/160] Upgrade python version to include 3.12 and 3.13 (#144) * update python * update dependencies * mute simpleimputer for now * edit docs * change image * restore tutorials * fix devel tests * change btb to baytune * fix python specification * cap copulas at 0.11 * update readme --- .github/workflows/tests.yml | 12 +- README.md | 2 +- examples/tutorials/7. Tuning a Pipeline.ipynb | 31 ++- ...or the best pipeline with BTBSession.ipynb | 241 ++++++------------ setup.py | 32 ++- tox.ini | 8 +- 6 files changed, 126 insertions(+), 200 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0eb00220..cbadf809 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.8] + python-version: ['3.10'] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -20,7 +20,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Upgrade pip - run: pip install -U pip setuptools wheel + run: pip install -U "pip<=24.1" setuptools wheel - name: Install lightfm run: python -m pip install --no-use-pep517 'lightfm<2' - name: Install package @@ -32,7 +32,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -53,7 +53,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -70,7 +70,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 @@ -91,7 +91,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 diff --git a/README.md b/README.md index 662a3ed3..fb5ba341 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Features include: ## Requirements -**MLBlocks** has been developed and tested on [Python 3.6, 3.7, 3.8, 3.9, and 3.10](https://www.python.org/downloads/) +**MLBlocks** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12, 3.13](https://www.python.org/downloads/) ## Install with `pip` diff --git a/examples/tutorials/7. Tuning a Pipeline.ipynb b/examples/tutorials/7. Tuning a Pipeline.ipynb index 7a288a46..484e0b22 100644 --- a/examples/tutorials/7. Tuning a Pipeline.ipynb +++ b/examples/tutorials/7. 
Tuning a Pipeline.ipynb @@ -181,7 +181,7 @@ { "data": { "text/plain": [ - "0.8639171383183359" + "0.863978563379761" ] }, "execution_count": 6, @@ -210,7 +210,7 @@ { "data": { "text/plain": [ - "0.8686773872402614" + "0.868554574842" ] }, "execution_count": 7, @@ -242,7 +242,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.tuning import Tunable\n", + "from baytune.tuning import Tunable\n", "\n", "tunable = Tunable.from_dict(tunable_hyperparameters)" ] @@ -265,7 +265,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.tuning import GPTuner\n", + "from baytune.tuning import GPTuner\n", "\n", "tuner = GPTuner(tunable)" ] @@ -345,16 +345,15 @@ "output_type": "stream", "text": [ "scoring pipeline 1\n", + "New best found: 0.871994161365419\n", "scoring pipeline 2\n", + "New best found: 0.8723319756253888\n", "scoring pipeline 3\n", "scoring pipeline 4\n", - "New best found: 0.8642241881762839\n", "scoring pipeline 5\n", "scoring pipeline 6\n", "scoring pipeline 7\n", - "New best found: 0.8644390957265209\n", "scoring pipeline 8\n", - "New best found: 0.8679095503945804\n", "scoring pipeline 9\n", "scoring pipeline 10\n" ] @@ -395,13 +394,13 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 39,\n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'most_frequent',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 70,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 6,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.07406443671152008,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9244108160038952,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + " 'max_labels'): 60,\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 190,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.13575511242790694,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.6326488945712287,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8}" ] }, "execution_count": 13, @@ -443,7 +442,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -457,7 +456,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb b/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb index 80ad93fb..a7e9d69a 100644 --- a/examples/tutorials/8. Searching for the best pipeline with BTBSession.ipynb +++ b/examples/tutorials/8. 
Searching for the best pipeline with BTBSession.ipynb @@ -157,7 +157,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "from btb.session import BTBSession\n", + "from baytune.session import BTBSession\n", "\n", "session = BTBSession(tunables, cross_validate, verbose=True)" ] @@ -314,12 +314,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "342fe40f08024adcb5b60eea25f49d37", + "model_id": "00c20e4b982f42a1873c0d12f550ee4b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" + " 0%| | 0/5 [00:00\", line 11, in cross_validate\n", + " File \"/var/folders/by/d1f3gk0x14v54qggfxmjbn1c0000gn/T/ipykernel_19852/2674531477.py\", line 11, in cross_validate\n", " pipeline.fit(X_train, y_train)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 754, in fit\n", - " block, block_name, context, output_variables, outputs, debug_info)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlpipeline.py\", line 645, in _produce_block\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlpipeline.py\", line 805, in fit\n", + " self._produce_block(\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlpipeline.py\", line 679, in _produce_block\n", " block_outputs = block.produce(**produce_args)\n", - " File \"/home/xals/Projects/MIT/MLBlocks.clean/mlblocks/mlblock.py\", line 322, in produce\n", + " File \"/Users/sarah/Documents/git-repos/MLBlocks/mlblocks/mlblock.py\", line 331, in produce\n", " return getattr(self.instance, self.produce_method)(**produce_kwargs)\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/mlprimitives/custom/text.py\", line 111, in produce\n", + " File \"/Users/sarah/Documents/git-repos/MLPrimitives/mlprimitives/custom/text.py\", line 111, in produce\n", " texts = X[self.column]\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/frame.py\", line 2927, in __getitem__\n", + " File \"/opt/anaconda3/envs/py10/lib/python3.10/site-packages/pandas/core/frame.py\", line 3807, in __getitem__\n", " indexer = self.columns.get_loc(key)\n", - " File \"/home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/pandas/core/indexes/base.py\", line 2659, in get_loc\n", - " return self._engine.get_loc(self._maybe_cast_indexer(key))\n", - " File \"pandas/_libs/index.pyx\", line 108, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/index.pyx\", line 132, in pandas._libs.index.IndexEngine.get_loc\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", - " File \"pandas/_libs/hashtable_class_helper.pxi\", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item\n", + " File \"/opt/anaconda3/envs/py10/lib/python3.10/site-packages/pandas/core/indexes/base.py\", line 3804, in get_loc\n", + " raise KeyError(key) from err\n", "KeyError: 'text'\n", - "2020-09-16 16:32:46,587 - WARNING - btb.session - Too many errors: 1. 
Removing tunable single_table.classification.text\n", - "2020-09-16 16:32:46,589 - INFO - btb.session - Creating Tunable instance from dict.\n", - "2020-09-16 16:32:46,589 - INFO - btb.session - Obtaining default configuration for single_table.classification.xgb\n", - "2020-09-16 16:32:52,100 - INFO - btb.session - Generating new proposal configuration for single_table.classification\n", - "2020-09-16 16:33:28,900 - INFO - btb.session - New optimal found: single_table.classification - 0.8728234138413778\n", - "2020-09-16 16:33:28,904 - INFO - btb.session - Generating new proposal configuration for single_table.classification.xgb\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" + "Too many errors: 1. Removing tunable single_table.classification.text\n" ] }, { "data": { "text/plain": [ - "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", - " 'name': 'single_table.classification',\n", + "{'id': '0ebe8af9c06a05f39821de36d6c9ffc2',\n", + " 'name': 'single_table.classification.xgb',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 63,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 7315,\n", + " 'max_labels'): 52,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", - " 'score': 0.8728234138413778}" + " ('xgboost.XGBClassifier#1', 'n_estimators'): 313,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7119589664956909,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.944854007471167,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.8641320270062784}" ] }, "execution_count": 11, @@ -489,23 +458,17 @@ { "data": { "text/plain": [ - "{'id': '7e662f9b90f0e123939b7532ecc221c7',\n", - " 'name': 'single_table.classification',\n", + "{'id': '0ebe8af9c06a05f39821de36d6c9ffc2',\n", + " 'name': 'single_table.classification.xgb',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 63,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'lowercase'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'binary'): True,\n", - " ('mlprimitives.custom.feature_extraction.StringVectorizer#1',\n", - " 'max_features'): 7315,\n", + " 'max_labels'): 52,\n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'median',\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 879,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23231879890615814,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.5474914147721585,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3},\n", - " 'score': 0.8728234138413778}" + " ('xgboost.XGBClassifier#1', 'n_estimators'): 313,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7119589664956909,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 
+      " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n",
+      " 'score': 0.8641320270062784}"
      ]
     },
     "execution_count": 12,
@@ -539,71 +502,31 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "8dd5d4626f304c279b2b368a671b6cb7",
+      "model_id": "a0dbe69a0340455a937f7376f7723ec4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
-      "HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))"
+      " 0%|          | 0/10 [00:00<?, ?it/s]"
     'graphviz>=0.9,<1',
-    'numpy>=1.17.1,<2',
-    'psutil>=5,<6',
+    'numpy>=1.17.1,<3',
+    'psutil>=5,<7',
 ]
 
 mlprimitives_requires = [
-    'mlprimitives>=0.3.0,<0.4',
-    'h5py<2.11.0,>=2.10.0', # <- tensorflow 2.3.2 conflict
-    'matplotlib<3.2.2,>=2.2.2', # <- copulas 0.3.3
+    'mlprimitives>=0.4.0,<0.5',
+    'h5py<4,>=2.10.0', # <- tensorflow 2.3.2 conflict
+    'matplotlib<4,>=2.2.2', # <- copulas 0.3.3
     'protobuf<4', # <- importlib
 ]
 
 examples_require = mlprimitives_requires + [
     'jupyter==1.0.0',
-    'baytune>=0.4.0,<0.5',
+    'baytune>=0.5.0,<0.6',
+    'copulas<0.12',
 ]
 
@@ -50,7 +51,7 @@
     # general
     'bumpversion>=0.5.3,<0.6',
     'pip>=9.0.1',
-    'watchdog>=0.8.3,<0.11',
+    'watchdog>=0.8.3,<5',
 
     # docs
     'm2r>=0.2.0,<0.3',
@@ -62,6 +63,15 @@
     'Jinja2>=2,<3', # >=3 makes sphinx theme fail
     'markupsafe<2.1.0',
 
+    # fails on Sphinx < v3.4
+    'alabaster<=0.7.12',
+    # fails on Sphinx < v5.0
+    'sphinxcontrib-applehelp<1.0.8',
+    'sphinxcontrib-devhelp<1.0.6',
+    'sphinxcontrib-htmlhelp<2.0.5',
+    'sphinxcontrib-serializinghtml<1.1.10',
+    'sphinxcontrib-qthelp<1.0.7',
+
     # style check
     'flake8>=3.7.7,<4',
     'isort>=4.3.4,<5',
@@ -93,12 +103,12 @@
     'License :: OSI Approved :: MIT License',
     'Natural Language :: English',
     'Programming Language :: Python :: 3',
-    'Programming Language :: Python :: 3.6',
-    'Programming Language :: Python :: 3.7',
     'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
     'Programming Language :: Python :: 3.10',
-    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.12',
+    'Programming Language :: Python :: 3.13',
 ],
 description='Pipelines and primitives for machine learning and data science.',
 extras_require={
@@ -116,7 +126,7 @@
     long_description_content_type='text/markdown',
     name='mlblocks',
     packages=find_packages(include=['mlblocks', 'mlblocks.*']),
-    python_requires='>=3.6,<3.12',
+    python_requires='>=3.8,<3.14',
     setup_requires=setup_requires,
     test_suite='tests',
     tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index 27e499ed..cdaadc29 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,12 +3,12 @@ envlist = py3{6,7,8,9,10,11}, test-devel
 
 [travis]
 python =
-    3.11: py11
-    3.10: py10
+    3.13: py313
+    3.12: py312
+    3.11: py311
+    3.10: py310
     3.9: py39
     3.8: py38, test-devel
-    3.7: py37
-    3.6: py36
 
 [testenv]
 passenv = CI TRAVIS TRAVIS_*

From a38b46a1fa4ae19998437759e34da9941f2066f6 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 11:53:06 -0500
Subject: [PATCH 157/160] =?UTF-8?q?Bump=20version:=200.6.2.dev0=20?=
 =?UTF-8?q?=E2=86=92=200.6.2.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index f42e9f83..0fbf2c2f 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2.dev0'
+__version__ = '0.6.2.dev1'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index d582e738..9be18137 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2.dev0
+current_version = 0.6.2.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index cc59e712..0ff336d1 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2.dev0',
+    version='0.6.2.dev1',
     zip_safe=False,
 )

From 1b3ffbce7379d20548501a472cd2c8331d67e1e2 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:18 -0500
Subject: [PATCH 158/160] add release notes

---
 HISTORY.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 1fcf520f..97c363f3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+0.6.2 - 2024-11-18
+------------------
+
+* Upgrade python version to include 3.12 and 3.13 - [Issue #144](https://github.com/MLBazaar/MLBlocks/issues/144) by @sarahmish
+
 0.6.1 - 2023-09-26
 ------------------

From 1406d3783b2bdd4f36f6243590207fdf1b6f668a Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:22 -0500
Subject: [PATCH 159/160] =?UTF-8?q?Bump=20version:=200.6.2.dev1=20?=
 =?UTF-8?q?=E2=86=92=200.6.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 0fbf2c2f..22734701 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2.dev1'
+__version__ = '0.6.2'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index 9be18137..dfc0a44b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2.dev1
+current_version = 0.6.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 0ff336d1..ee4f2884 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2.dev1',
+    version='0.6.2',
     zip_safe=False,
 )

From db5ff4b925358ef568492b45058dddded05be873 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 18 Nov 2024 13:45:35 -0500
Subject: [PATCH 160/160] =?UTF-8?q?Bump=20version:=200.6.2=20=E2=86=92=200?=
 =?UTF-8?q?.6.3.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mlblocks/__init__.py | 2 +-
 setup.cfg            | 2 +-
 setup.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlblocks/__init__.py b/mlblocks/__init__.py
index 22734701..fa7130da 100644
--- a/mlblocks/__init__.py
+++ b/mlblocks/__init__.py
@@ -20,7 +20,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.6.2'
+__version__ = '0.6.3.dev0'
 
 __all__ = [
     'MLBlock',
diff --git a/setup.cfg b/setup.cfg
index dfc0a44b..8908f680 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2
+current_version = 0.6.3.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index ee4f2884..e4ab47c9 100644
--- a/setup.py
+++ b/setup.py
@@ -131,6 +131,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/MLBazaar/MLBlocks',
-    version='0.6.2',
+    version='0.6.3.dev0',
     zip_safe=False,
 )