Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions notebooks/data/feature_matrix.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
COUNT(alarms),MAX(alarms.IND_DURATION),MIN(alarms.IND_DURATION),SUM(alarms.IND_DURATION),COUNT(stoppages),MAX(stoppages.COD_WO),MAX(stoppages.IND_DURATION),MAX(stoppages.IND_LOST_GEN),MIN(stoppages.COD_WO),MIN(stoppages.IND_DURATION),MIN(stoppages.IND_LOST_GEN),SUM(stoppages.COD_WO),SUM(stoppages.IND_DURATION),SUM(stoppages.IND_LOST_GEN),COUNT(scada),MAX(scada.val1),MAX(scada.val2),MIN(scada.val1),MIN(scada.val2),SUM(scada.val1),SUM(scada.val2),MAX(stoppages.NUM_WORDS(DES_COMMENTS)),MAX(stoppages.NUM_WORDS(DES_DESCRIPTION)),MAX(stoppages.NUM_WORDS(DES_WO_NAME)),MIN(stoppages.NUM_WORDS(DES_COMMENTS)),MIN(stoppages.NUM_WORDS(DES_DESCRIPTION)),MIN(stoppages.NUM_WORDS(DES_WO_NAME)),SUM(stoppages.NUM_WORDS(DES_COMMENTS)),SUM(stoppages.NUM_WORDS(DES_DESCRIPTION)),SUM(stoppages.NUM_WORDS(DES_WO_NAME)),label,TURBINE_PI_ID_TA00,TURBINE_PI_ID_TA01,TURBINE_PI_ID_TA02,TURBINE_PI_ID_TA03,TURBINE_PI_ID_TA04,TURBINE_PI_ID_TA05,TURBINE_PI_ID_TA06,TURBINE_PI_ID_TA07,TURBINE_PI_ID_TA08,TURBINE_PI_ID_TA09,TURBINE_PI_ID_TA10,TURBINE_PI_ID_TA11,TURBINE_PI_ID_TA12,TURBINE_PI_ID_TA13,TURBINE_PI_ID_TA14,TURBINE_PI_ID_TA15,TURBINE_LOCAL_ID_A0,TURBINE_LOCAL_ID_A1,TURBINE_LOCAL_ID_A10,TURBINE_LOCAL_ID_A11,TURBINE_LOCAL_ID_A12,TURBINE_LOCAL_ID_A13,TURBINE_LOCAL_ID_A14,TURBINE_LOCAL_ID_A15,TURBINE_LOCAL_ID_A2,TURBINE_LOCAL_ID_A3,TURBINE_LOCAL_ID_A4,TURBINE_LOCAL_ID_A5,TURBINE_LOCAL_ID_A6,TURBINE_LOCAL_ID_A7,TURBINE_LOCAL_ID_A8,TURBINE_LOCAL_ID_A9,TURBINE_SAP_COD_LOC000,TURBINE_SAP_COD_LOC001,TURBINE_SAP_COD_LOC002,TURBINE_SAP_COD_LOC003,TURBINE_SAP_COD_LOC004,TURBINE_SAP_COD_LOC005,TURBINE_SAP_COD_LOC006,TURBINE_SAP_COD_LOC007,TURBINE_SAP_COD_LOC008,TURBINE_SAP_COD_LOC009,TURBINE_SAP_COD_LOC010,TURBINE_SAP_COD_LOC011,TURBINE_SAP_COD_LOC012,TURBINE_SAP_COD_LOC013,TURBINE_SAP_COD_LOC014,TURBINE_SAP_COD_LOC015,DES_CORE_ELEMENT_T00,DES_CORE_ELEMENT_T01,DES_CORE_ELEMENT_T02,DES_CORE_ELEMENT_T03,DES_CORE_ELEMENT_T04,DES_CORE_ELEMENT_T05,DES_CORE_ELEMENT_T06,DES_CORE_ELEMENT_T07,DES_CORE_ELEMENT_T08,D
ES_CORE_ELEMENT_T09,DES_CORE_ELEMENT_T10,DES_CORE_ELEMENT_T11,DES_CORE_ELEMENT_T12,DES_CORE_ELEMENT_T13,DES_CORE_ELEMENT_T14,DES_CORE_ELEMENT_T15,SITE_LOCATION,DES_CORE_PLANT_LOC,COD_PLANT_SAP_ABC,COD_PLANT_SAP_XYZ,PI_COLLECTOR_SITE_NAME_LOC0,PI_LOCAL_SITE_NAME_LOC0
1,,,0.0,1,12345.0,,,12345.0,,,12345.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,37452.0,,,37452.0,,,37452.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,23432.0,,,23432.0,,,23432.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,12452.0,,,12452.0,,,12452.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,32435.0,,,32435.0,,,32435.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,23534.0,,,23534.0,,,23534.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,65431.0,,,65431.0,,,65431.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,35742.0,,,35742.0,,,35742.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,21343.0,,,21343.0,,,21343.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,43565.0,,,43565.0,,,43565.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,24525.0,,,24525.0,,,24525.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1
0,,,0.0,1,67432.0,,,67432.0,,,67432.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1
0,,,0.0,1,21342.0,,,21342.0,,,21342.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1
203 changes: 203 additions & 0 deletions notebooks/modeling.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d4f4e2b7",
"metadata": {},
"source": [
"# Modeling\n",
"\n",
"In this tutorial, we will show you how to use `zephyr_ml` to train models using the `Zephyr` class. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to `feature_engineering` notebook.\n",
"\n",
"## 1) Load the Feature Matrix\n",
"\n",
"Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4a6724ad",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"feature_matrix = pd.read_csv('data/feature_matrix.csv')"
]
},
{
"cell_type": "markdown",
"id": "02e2c90a",
"metadata": {},
"source": [
"## 2) Preparing Model Inputs\n",
"\n",
"Prepare the data for modeling. Depending on the data, you might need to: normalize the data, impute missing values, create one-hot encodings for categorical values, etc.\n",
"\n",
"In this part of the notebook, we do the following:\n",
"* create `X` and `y` variables from the feature matrix\n",
"* impute missing values using a SimpleImpute\n",
"* split the data into training and testing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "20da6581",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# pop the target labels\n",
"y = list(feature_matrix.pop('label'))\n",
"X = feature_matrix.values\n",
"\n",
"# impute missing values\n",
"imputer = SimpleImputer()\n",
"X = imputer.fit_transform(X)\n",
"\n",
"# create train and test splits\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)"
]
},
{
"cell_type": "markdown",
"id": "32afe1aa",
"metadata": {},
"source": [
"## 3) Train a Model\n",
"\n",
"We train a model using the `Zephyr` interface where you can train, infer, and evaluate a pipeline.\n",
"In this notebook, we use an `xgb` pipeline which consists of two primitives:\n",
"\n",
"```\n",
" \"xgboost.XGBClassifier\"\n",
" \"zephyr_ml.primitives.postprocessing.FindThreshold\"\n",
"```\n",
"\n",
"An `XGBClassifier` primitive is an XGB model that returns the probability of each class, and `FindThreshold` primitive creates binary labels from the output of the XGB model by choosing a threshold that produces the best metric value (F1 Score by default)\n",
"\n",
"To use a pipeline, we simply pass the name of the pipeline to `Zephyr`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b02986d9",
"metadata": {},
"outputs": [],
"source": [
"from zephyr_ml import Zephyr\n",
"\n",
"zephyr = Zephyr('xgb')"
]
},
{
"cell_type": "markdown",
"id": "a1297396",
"metadata": {},
"source": [
"Then, training a pipeline can be done using the `fit` function and passing the training data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "442c5258",
"metadata": {},
"outputs": [],
"source": [
"zephyr.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "8d4bf2cc",
"metadata": {},
"source": [
"Now that the pipeline is trained, we can use it to predict the values of the test data using `predict` function"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "83814cd8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 0, 1]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"zephyr.predict(X_test)"
]
},
{
"cell_type": "markdown",
"id": "15f257eb",
"metadata": {},
"source": [
"Lastly, we can evaluate the performance of the pipeline using `evaluate` function"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "191a123a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"accuracy 0.666667\n",
"f1 0.666667\n",
"recall 1.000000\n",
"precision 0.500000\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"zephyr.evaluate(X_test, y_test)"
]
}
],
"metadata": {
"interpreter": {
"hash": "2d6fabd7bf745a21519616ebdce3b2479184204dadf576aa19f086ff78438203"
},
"kernelspec": {
"display_name": "zephyr",
"language": "python",
"name": "zephyr"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
8 changes: 8 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
'pandas>=1,<2',
'composeml>=0.1.6,<1.0',
'featuretools>=1.0.0,<2.0.0',
'mlblocks>=0.4.0,<0.5',
'xgboost>=0.72.1,<1',
'jupyter==1.0.0',
]

Expand Down Expand Up @@ -83,6 +85,12 @@
'Programming Language :: Python :: 3.8',
],
description='Prediction engineering methods for Draco.',
entry_points={
'mlblocks': [
'primitives=zephyr_ml:MLBLOCKS_PRIMITIVES',
'pipelines=zephyr_ml:MLBLOCKS_PIPELINES'
],
},
extras_require={
'test': tests_require,
'dev': development_requires + tests_require,
Expand Down
32 changes: 32 additions & 0 deletions tests/primitives/test_postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from unittest import TestCase

import numpy as np

from zephyr_ml.primitives.postprocessing import FindThreshold


class FindThresholdTest(TestCase):
    """Unit tests for the ``FindThreshold`` postprocessing primitive.

    The fixtures describe eight samples with known labels and the model's
    probability outputs in both supported layouts: a flat vector of
    positive-class scores, and a two-column ``(negative, positive)`` matrix.
    """

    # Ground-truth binary labels for the eight samples.
    y = np.array([1, 1, 0, 0, 1, 1, 1, 0])
    # Positive-class probabilities as a flat vector.
    y_hat_1d = np.array([0.8, 0.9, 0.6, 0.5, 0.85, 0.7, 0.95, 0.2])
    # Same probabilities as (negative, positive) columns.
    y_hat_2d = np.array([
        [0.2, 0.8],
        [0.1, 0.9],
        [0.4, 0.6],
        [0.5, 0.5],
        [0.15, 0.85],
        [0.3, 0.7],
        [0.05, 0.95],
        [0.8, 0.2],
    ])

    def _run(self, y, y_hat, value):
        """Fit a fresh ``FindThreshold`` and verify the cut-off and labels."""
        finder = FindThreshold()
        finder.fit(y, y_hat)

        assert finder._threshold == value
        np.testing.assert_allclose(finder.apply_threshold(y_hat), y)

    def test_1d(self):
        # Probabilities supplied as a 1-D score vector.
        self._run(self.y, self.y_hat_1d, 0.6)

    def test_2d(self):
        # Probabilities supplied as a 2-D per-class matrix.
        self._run(self.y, self.y_hat_2d, 0.6)
99 changes: 99 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import os

import numpy as np
import pandas as pd

from zephyr_ml.core import Zephyr


class TestZephyr:
    """Integration tests for the ``Zephyr`` modeling interface.

    Exercises the full fit / predict / save / load / evaluate surface of the
    ``'xgb'`` pipeline against small, perfectly separable fixtures, so the
    pipeline is expected to score 1.0 on every metric.
    """

    @classmethod
    def setup_class(cls):
        """Build the train/test/random fixtures shared by all tests."""
        # Seed so the randomly generated features are reproducible across
        # runs; unseeded fixtures make any data-dependent failure flaky.
        np.random.seed(0)

        # 'feature 2' equals the label, so the data is perfectly separable.
        cls.train = pd.DataFrame({
            'feature 1': np.random.random(300),
            'feature 2': [0] * 150 + [1] * 150,
        })
        cls.train_y = cls.train['feature 2'].to_list()

        cls.test = pd.DataFrame({
            'feature 1': np.random.random(100),
            'feature 2': [0] * 25 + [1] * 50 + [0] * 25,
        })
        cls.test_y = cls.test['feature 2'].to_list()

        # Pure-noise data: labels are independent of the features.
        cls.random = pd.DataFrame({
            'feature 1': list(range(100)),
            'feature 2': np.random.random(100),
            'feature 3': np.random.random(100),
        })
        cls.random_y = [1 if x > 0.5 else 0 for x in np.random.random(100)]

    def setup_method(self, method):
        # pytest-native replacement for the deprecated nose-style ``setup``;
        # gives every test a fresh, untrained pipeline.
        self.zephyr = Zephyr('xgb')

    @staticmethod
    def _perfect_scores():
        """Expected metrics when every sample is classified correctly."""
        return pd.Series({
            'accuracy': 1.0,
            'f1': 1.0,
            'recall': 1.0,
            'precision': 1.0,
        })

    def test_fit(self):
        self.zephyr.fit(self.train, self.train_y)

    def test_predict(self):
        self.zephyr.fit(self.train, self.train_y)

        predicted = self.zephyr.predict(self.test)

        # The label is present as a feature, so predictions must be exact.
        assert self.test_y == predicted

    def test_fit_predict(self):
        predicted = self.zephyr.fit_predict(self.random, self.random_y)

        # Noise labels: only the output type is a meaningful contract here.
        assert isinstance(predicted, list)

    def test_save_load(self, tmpdir):
        path = os.path.join(tmpdir, 'some/path.pkl')
        self.zephyr.save(path)

        new_zephyr = Zephyr.load(path)
        assert new_zephyr == self.zephyr

    def test_evaluate(self):
        self.zephyr.fit(self.test, self.test_y)
        scores = self.zephyr.evaluate(X=self.test, y=self.test_y)

        pd.testing.assert_series_equal(self._perfect_scores(), scores)

    def test_evaluate_fit(self):
        # ``fit=True`` should train on (X, y) before scoring.
        scores = self.zephyr.evaluate(
            X=self.test,
            y=self.test_y,
            fit=True,
        )

        pd.testing.assert_series_equal(self._perfect_scores(), scores)

    def test_evaluate_train_data(self):
        # Train on the dedicated training split, score on the test split.
        scores = self.zephyr.evaluate(
            X=self.test,
            y=self.test_y,
            fit=True,
            train_X=self.train,
            train_y=self.train_y
        )

        pd.testing.assert_series_equal(self._perfect_scores(), scores)
6 changes: 6 additions & 0 deletions zephyr_ml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,11 @@
__email__ = '[email protected]'
__version__ = '0.2.4.dev0'

import os

from zephyr_ml.core import Zephyr
from zephyr_ml.entityset import create_pidata_entityset, create_scada_entityset
from zephyr_ml.labeling import DataLabeler

# Filesystem locations of the MLBlocks primitive JSONs and pipeline
# definitions shipped with the package; exposed through the ``mlblocks``
# entry points declared in ``setup.py`` so MLBlocks can resolve them by name.
MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'primitives', 'jsons')
MLBLOCKS_PIPELINES = os.path.join(os.path.dirname(__file__), 'pipelines')
Loading