Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions notebooks/data/feature_matrix.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
COUNT(alarms),MAX(alarms.IND_DURATION),MIN(alarms.IND_DURATION),SUM(alarms.IND_DURATION),COUNT(stoppages),MAX(stoppages.COD_WO),MAX(stoppages.IND_DURATION),MAX(stoppages.IND_LOST_GEN),MIN(stoppages.COD_WO),MIN(stoppages.IND_DURATION),MIN(stoppages.IND_LOST_GEN),SUM(stoppages.COD_WO),SUM(stoppages.IND_DURATION),SUM(stoppages.IND_LOST_GEN),COUNT(scada),MAX(scada.val1),MAX(scada.val2),MIN(scada.val1),MIN(scada.val2),SUM(scada.val1),SUM(scada.val2),MAX(stoppages.NUM_WORDS(DES_COMMENTS)),MAX(stoppages.NUM_WORDS(DES_DESCRIPTION)),MAX(stoppages.NUM_WORDS(DES_WO_NAME)),MIN(stoppages.NUM_WORDS(DES_COMMENTS)),MIN(stoppages.NUM_WORDS(DES_DESCRIPTION)),MIN(stoppages.NUM_WORDS(DES_WO_NAME)),SUM(stoppages.NUM_WORDS(DES_COMMENTS)),SUM(stoppages.NUM_WORDS(DES_DESCRIPTION)),SUM(stoppages.NUM_WORDS(DES_WO_NAME)),label,TURBINE_PI_ID_TA00,TURBINE_PI_ID_TA01,TURBINE_PI_ID_TA02,TURBINE_PI_ID_TA03,TURBINE_PI_ID_TA04,TURBINE_PI_ID_TA05,TURBINE_PI_ID_TA06,TURBINE_PI_ID_TA07,TURBINE_PI_ID_TA08,TURBINE_PI_ID_TA09,TURBINE_PI_ID_TA10,TURBINE_PI_ID_TA11,TURBINE_PI_ID_TA12,TURBINE_PI_ID_TA13,TURBINE_PI_ID_TA14,TURBINE_PI_ID_TA15,TURBINE_LOCAL_ID_A0,TURBINE_LOCAL_ID_A1,TURBINE_LOCAL_ID_A10,TURBINE_LOCAL_ID_A11,TURBINE_LOCAL_ID_A12,TURBINE_LOCAL_ID_A13,TURBINE_LOCAL_ID_A14,TURBINE_LOCAL_ID_A15,TURBINE_LOCAL_ID_A2,TURBINE_LOCAL_ID_A3,TURBINE_LOCAL_ID_A4,TURBINE_LOCAL_ID_A5,TURBINE_LOCAL_ID_A6,TURBINE_LOCAL_ID_A7,TURBINE_LOCAL_ID_A8,TURBINE_LOCAL_ID_A9,TURBINE_SAP_COD_LOC000,TURBINE_SAP_COD_LOC001,TURBINE_SAP_COD_LOC002,TURBINE_SAP_COD_LOC003,TURBINE_SAP_COD_LOC004,TURBINE_SAP_COD_LOC005,TURBINE_SAP_COD_LOC006,TURBINE_SAP_COD_LOC007,TURBINE_SAP_COD_LOC008,TURBINE_SAP_COD_LOC009,TURBINE_SAP_COD_LOC010,TURBINE_SAP_COD_LOC011,TURBINE_SAP_COD_LOC012,TURBINE_SAP_COD_LOC013,TURBINE_SAP_COD_LOC014,TURBINE_SAP_COD_LOC015,DES_CORE_ELEMENT_T00,DES_CORE_ELEMENT_T01,DES_CORE_ELEMENT_T02,DES_CORE_ELEMENT_T03,DES_CORE_ELEMENT_T04,DES_CORE_ELEMENT_T05,DES_CORE_ELEMENT_T06,DES_CORE_ELEMENT_T07,DES_CORE_ELEMENT_T08,D
ES_CORE_ELEMENT_T09,DES_CORE_ELEMENT_T10,DES_CORE_ELEMENT_T11,DES_CORE_ELEMENT_T12,DES_CORE_ELEMENT_T13,DES_CORE_ELEMENT_T14,DES_CORE_ELEMENT_T15,SITE_LOCATION,DES_CORE_PLANT_LOC,COD_PLANT_SAP_ABC,COD_PLANT_SAP_XYZ,PI_COLLECTOR_SITE_NAME_LOC0,PI_LOCAL_SITE_NAME_LOC0
1,,,0.0,1,12345.0,,,12345.0,,,12345.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,37452.0,,,37452.0,,,37452.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,23432.0,,,23432.0,,,23432.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,12452.0,,,12452.0,,,12452.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,32435.0,,,32435.0,,,32435.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,23534.0,,,23534.0,,,23534.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,65431.0,,,65431.0,,,65431.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,1,1
0,,,0.0,1,35742.0,,,35742.0,,,35742.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,21343.0,,,21343.0,,,21343.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,43565.0,,,43565.0,,,43565.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,1,1
0,,,0.0,1,24525.0,,,24525.0,,,24525.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1
0,,,0.0,1,67432.0,,,67432.0,,,67432.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1
0,,,0.0,1,21342.0,,,21342.0,,,21342.0,0.0,0.0,0,,,,,0.0,0.0,4.0,2.0,3.0,4.0,2.0,3.0,4.0,2.0,3.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1
203 changes: 203 additions & 0 deletions notebooks/modeling.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d4f4e2b7",
"metadata": {},
"source": [
"# Modeling\n",
"\n",
"In this tutorial, we will show you how to use `zephyr_ml` to train models using the `Zephyr` class. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to `feature_engineering` notebook.\n",
"\n",
"## 1) Load the Feature Matrix\n",
"\n",
"Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4a6724ad",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"feature_matrix = pd.read_csv('data/feature_matrix.csv')"
]
},
{
"cell_type": "markdown",
"id": "02e2c90a",
"metadata": {},
"source": [
"## 2) Preparing Model Inputs\n",
"\n",
"Prepare the data for modeling. Depending on the data, you might need to: normalize the data, impute missing values, create one-hot encodings for categorical values, etc.\n",
"\n",
"In this part of the notebook, we do the following:\n",
"* create `X` and `y` variables from the feature matrix\n",
"* impute missing values using a SimpleImpute\n",
"* split the data into training and testing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "20da6581",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# pop the target labels\n",
"y = list(feature_matrix.pop('label'))\n",
"X = feature_matrix.values\n",
"\n",
"# impute missing values\n",
"imputer = SimpleImputer()\n",
"X = imputer.fit_transform(X)\n",
"\n",
"# create train and test splits\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)"
]
},
{
"cell_type": "markdown",
"id": "32afe1aa",
"metadata": {},
"source": [
"## 3) Train a Model\n",
"\n",
"We train a model using the `Zephyr` interface where you can train, infer, and evaluate a pipeline.\n",
"In this notebook, we use an `xgb` pipeline which consists of two primitives:\n",
"\n",
"```\n",
" \"xgboost.XGBClassifier\"\n",
" \"zephyr_ml.primitives.postprocessing.FindThreshold\"\n",
"```\n",
"\n",
"An `XGBClassifier` primitive is an XGB model that returns the probability of each class, and `FindThreshold` primitive creates binary labels from the output of the XGB model by choosing a threshold that produces the best metric value (F1 Score by default)\n",
"\n",
"To use a pipeline, we simply pass the name of the pipeline to `Zephyr`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b02986d9",
"metadata": {},
"outputs": [],
"source": [
"from zephyr_ml import Zephyr\n",
"\n",
"zephyr = Zephyr('xgb')"
]
},
{
"cell_type": "markdown",
"id": "a1297396",
"metadata": {},
"source": [
"Then, training a pipeline can be done using the `fit` function and passing the training data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "442c5258",
"metadata": {},
"outputs": [],
"source": [
"zephyr.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "8d4bf2cc",
"metadata": {},
"source": [
"Now that the pipeline is trained, we can use it to predict the values of the test data using `predict` function"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "83814cd8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 0, 1]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"zephyr.predict(X_test)"
]
},
{
"cell_type": "markdown",
"id": "15f257eb",
"metadata": {},
"source": [
"Lastly, we can evaluate the performance of the pipeline using `evaluate` function"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "191a123a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"accuracy 0.666667\n",
"f1 0.666667\n",
"recall 1.000000\n",
"precision 0.500000\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"zephyr.evaluate(X_test, y_test)"
]
}
],
"metadata": {
"interpreter": {
"hash": "2d6fabd7bf745a21519616ebdce3b2479184204dadf576aa19f086ff78438203"
},
"kernelspec": {
"display_name": "zephyr",
"language": "python",
"name": "zephyr"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
8 changes: 8 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
'pandas>=1,<2',
'composeml>=0.1.6,<1.0',
'featuretools>=1.0.0,<2.0.0',
'mlblocks>=0.4.0,<0.5',
'xgboost>=0.72.1,<1',
'jupyter==1.0.0',
]

Expand Down Expand Up @@ -83,6 +85,12 @@
'Programming Language :: Python :: 3.8',
],
description='Prediction engineering methods for Draco.',
entry_points={
'mlblocks': [
'primitives=zephyr_ml:MLBLOCKS_PRIMITIVES',
'pipelines=zephyr_ml:MLBLOCKS_PIPELINES'
],
},
extras_require={
'test': tests_require,
'dev': development_requires + tests_require,
Expand Down
32 changes: 32 additions & 0 deletions tests/primitives/test_postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from unittest import TestCase

import numpy as np

from zephyr_ml.primitives.postprocessing import FindThreshold


class FindThresholdTest(TestCase):
    """Unit tests for the ``FindThreshold`` postprocessing primitive.

    The fixtures describe eight samples with known labels and the model's
    probability outputs in both supported layouts: a flat vector of
    positive-class scores, and a two-column ``(negative, positive)`` matrix.
    """

    # Ground-truth binary labels for the eight samples.
    y = np.array([1, 1, 0, 0, 1, 1, 1, 0])
    # Positive-class probabilities as a flat vector.
    y_hat_1d = np.array([0.8, 0.9, 0.6, 0.5, 0.85, 0.7, 0.95, 0.2])
    # Same probabilities as (negative, positive) columns.
    y_hat_2d = np.array([
        [0.2, 0.8],
        [0.1, 0.9],
        [0.4, 0.6],
        [0.5, 0.5],
        [0.15, 0.85],
        [0.3, 0.7],
        [0.05, 0.95],
        [0.8, 0.2],
    ])

    def _run(self, y, y_hat, value):
        """Fit a fresh ``FindThreshold`` and verify the cut-off and labels."""
        finder = FindThreshold()
        finder.fit(y, y_hat)

        assert finder._threshold == value
        np.testing.assert_allclose(finder.apply_threshold(y_hat), y)

    def test_1d(self):
        # Probabilities supplied as a 1-D score vector.
        self._run(self.y, self.y_hat_1d, 0.6)

    def test_2d(self):
        # Probabilities supplied as a 2-D per-class matrix.
        self._run(self.y, self.y_hat_2d, 0.6)
99 changes: 99 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import os

import numpy as np
import pandas as pd

from zephyr_ml.core import Zephyr


class TestZephyr:
    """Integration tests for the ``Zephyr`` modeling interface.

    Exercises the full fit / predict / save / load / evaluate surface of the
    ``'xgb'`` pipeline against small, perfectly separable fixtures, so the
    pipeline is expected to score 1.0 on every metric.
    """

    @classmethod
    def setup_class(cls):
        """Build the train/test/random fixtures shared by all tests."""
        # Seed so the randomly generated features are reproducible across
        # runs; unseeded fixtures make any data-dependent failure flaky.
        np.random.seed(0)

        # 'feature 2' equals the label, so the data is perfectly separable.
        cls.train = pd.DataFrame({
            'feature 1': np.random.random(300),
            'feature 2': [0] * 150 + [1] * 150,
        })
        cls.train_y = cls.train['feature 2'].to_list()

        cls.test = pd.DataFrame({
            'feature 1': np.random.random(100),
            'feature 2': [0] * 25 + [1] * 50 + [0] * 25,
        })
        cls.test_y = cls.test['feature 2'].to_list()

        # Pure-noise data: labels are independent of the features.
        cls.random = pd.DataFrame({
            'feature 1': list(range(100)),
            'feature 2': np.random.random(100),
            'feature 3': np.random.random(100),
        })
        cls.random_y = [1 if x > 0.5 else 0 for x in np.random.random(100)]

    def setup_method(self, method):
        # pytest-native replacement for the deprecated nose-style ``setup``;
        # gives every test a fresh, untrained pipeline.
        self.zephyr = Zephyr('xgb')

    @staticmethod
    def _perfect_scores():
        """Expected metrics when every sample is classified correctly."""
        return pd.Series({
            'accuracy': 1.0,
            'f1': 1.0,
            'recall': 1.0,
            'precision': 1.0,
        })

    def test_fit(self):
        self.zephyr.fit(self.train, self.train_y)

    def test_predict(self):
        self.zephyr.fit(self.train, self.train_y)

        predicted = self.zephyr.predict(self.test)

        # The label is present as a feature, so predictions must be exact.
        assert self.test_y == predicted

    def test_fit_predict(self):
        predicted = self.zephyr.fit_predict(self.random, self.random_y)

        # Noise labels: only the output type is a meaningful contract here.
        assert isinstance(predicted, list)

    def test_save_load(self, tmpdir):
        path = os.path.join(tmpdir, 'some/path.pkl')
        self.zephyr.save(path)

        new_zephyr = Zephyr.load(path)
        assert new_zephyr == self.zephyr

    def test_evaluate(self):
        self.zephyr.fit(self.test, self.test_y)
        scores = self.zephyr.evaluate(X=self.test, y=self.test_y)

        pd.testing.assert_series_equal(self._perfect_scores(), scores)

    def test_evaluate_fit(self):
        # ``fit=True`` should train on (X, y) before scoring.
        scores = self.zephyr.evaluate(
            X=self.test,
            y=self.test_y,
            fit=True,
        )

        pd.testing.assert_series_equal(self._perfect_scores(), scores)

    def test_evaluate_train_data(self):
        # Train on the dedicated training split, score on the test split.
        scores = self.zephyr.evaluate(
            X=self.test,
            y=self.test_y,
            fit=True,
            train_X=self.train,
            train_y=self.train_y
        )

        pd.testing.assert_series_equal(self._perfect_scores(), scores)
6 changes: 6 additions & 0 deletions zephyr_ml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,11 @@
__email__ = '[email protected]'
__version__ = '0.2.4.dev0'

import os

from zephyr_ml.core import Zephyr
from zephyr_ml.entityset import create_pidata_entityset, create_scada_entityset
from zephyr_ml.labeling import DataLabeler

# Filesystem locations of the MLBlocks primitive JSONs and pipeline
# definitions shipped with the package; exposed through the ``mlblocks``
# entry points declared in ``setup.py`` so MLBlocks can resolve them by name.
MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'primitives', 'jsons')
MLBLOCKS_PIPELINES = os.path.join(os.path.dirname(__file__), 'pipelines')
Loading